{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.0, "eval_steps": 500, "global_step": 609492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001968852749502865, "grad_norm": 1.9081757068634033, "learning_rate": 9.999999655665828e-07, "loss": 1.9557, "num_input_tokens_seen": 102400, "step": 100 }, { "epoch": 0.00393770549900573, "grad_norm": 2.0245535373687744, "learning_rate": 9.999998465374926e-07, "loss": 2.0835, "num_input_tokens_seen": 202416, "step": 200 }, { "epoch": 0.005906558248508594, "grad_norm": 1.8482612371444702, "learning_rate": 9.999996424876457e-07, "loss": 1.9022, "num_input_tokens_seen": 304816, "step": 300 }, { "epoch": 0.00787541099801146, "grad_norm": 1.8051942586898804, "learning_rate": 9.99999353417077e-07, "loss": 1.9903, "num_input_tokens_seen": 407216, "step": 400 }, { "epoch": 0.009844263747514324, "grad_norm": 2.291351795196533, "learning_rate": 9.999989793258351e-07, "loss": 1.9607, "num_input_tokens_seen": 509368, "step": 500 }, { "epoch": 0.011813116497017188, "grad_norm": 1.9627857208251953, "learning_rate": 9.999985202139843e-07, "loss": 1.9747, "num_input_tokens_seen": 611152, "step": 600 }, { "epoch": 0.013781969246520052, "grad_norm": 1.911093831062317, "learning_rate": 9.99997976081602e-07, "loss": 1.9834, "num_input_tokens_seen": 712304, "step": 700 }, { "epoch": 0.01575082199602292, "grad_norm": 1.9051035642623901, "learning_rate": 9.999973469287812e-07, "loss": 1.9282, "num_input_tokens_seen": 814040, "step": 800 }, { "epoch": 0.01771967474552578, "grad_norm": 1.8340057134628296, "learning_rate": 9.999966327556286e-07, "loss": 1.9803, "num_input_tokens_seen": 915376, "step": 900 }, { "epoch": 0.019688527495028647, "grad_norm": 2.0401248931884766, "learning_rate": 9.99995833562266e-07, "loss": 2.0006, "num_input_tokens_seen": 1017776, "step": 1000 }, { "epoch": 0.02165738024453151, "grad_norm": 2.080766201019287, "learning_rate": 9.999949493488289e-07, "loss": 2.0001, "num_input_tokens_seen": 1120176, "step": 1100 }, { "epoch": 0.023626232994034376, "grad_norm": 1.8966131210327148, "learning_rate": 9.999939801154677e-07, "loss": 2.0123, "num_input_tokens_seen": 1221976, "step": 1200 }, { "epoch": 0.025595085743537242, "grad_norm": 2.087946653366089, "learning_rate": 9.999929258623475e-07, "loss": 1.9841, "num_input_tokens_seen": 1324376, "step": 1300 }, { "epoch": 0.027563938493040105, "grad_norm": 4.0202460289001465, "learning_rate": 9.999917865896474e-07, "loss": 1.9815, "num_input_tokens_seen": 1425480, "step": 1400 }, { "epoch": 0.02953279124254297, "grad_norm": 1.7611056566238403, "learning_rate": 9.99990562297561e-07, "loss": 2.0264, "num_input_tokens_seen": 1525512, "step": 1500 }, { "epoch": 0.03150164399204584, "grad_norm": 2.0431771278381348, "learning_rate": 9.999892529862967e-07, "loss": 2.0026, "num_input_tokens_seen": 1627912, "step": 1600 }, { "epoch": 0.0334704967415487, "grad_norm": 1.9018654823303223, "learning_rate": 9.999878586560772e-07, "loss": 2.0291, "num_input_tokens_seen": 1728800, "step": 1700 }, { "epoch": 0.03543934949105156, "grad_norm": 1.7411730289459229, "learning_rate": 9.999863793071392e-07, "loss": 2.0356, "num_input_tokens_seen": 1828824, "step": 1800 }, { "epoch": 0.03740820224055443, "grad_norm": 1.8061546087265015, "learning_rate": 9.999848149397345e-07, "loss": 1.9708, "num_input_tokens_seen": 1929944, "step": 1900 }, { "epoch": 0.039377054990057295, "grad_norm": 1.8095059394836426, "learning_rate": 9.999831655541291e-07, "loss": 1.969, "num_input_tokens_seen": 2032256, "step": 2000 }, { "epoch": 0.04134590773956016, "grad_norm": 1.8762112855911255, "learning_rate": 9.999814311506033e-07, "loss": 1.9973, "num_input_tokens_seen": 2134656, "step": 2100 }, { "epoch": 0.04331476048906302, "grad_norm": 1.7770034074783325, "learning_rate": 9.999796117294525e-07, "loss": 1.9728, "num_input_tokens_seen": 2237056, "step": 2200 }, { "epoch": 0.04528361323856589, "grad_norm": 8.828059196472168, "learning_rate": 9.999777072909855e-07, "loss": 1.9541, "num_input_tokens_seen": 2338688, "step": 2300 }, { "epoch": 0.04725246598806875, "grad_norm": 1.8785701990127563, "learning_rate": 9.999757178355265e-07, "loss": 1.9632, "num_input_tokens_seen": 2441088, "step": 2400 }, { "epoch": 0.049221318737571615, "grad_norm": 1.9118993282318115, "learning_rate": 9.999736433634135e-07, "loss": 2.0193, "num_input_tokens_seen": 2543488, "step": 2500 }, { "epoch": 0.051190171487074485, "grad_norm": 2.069748878479004, "learning_rate": 9.999714838749996e-07, "loss": 1.9967, "num_input_tokens_seen": 2645664, "step": 2600 }, { "epoch": 0.05315902423657735, "grad_norm": 1.9387198686599731, "learning_rate": 9.999692393706517e-07, "loss": 1.9902, "num_input_tokens_seen": 2748064, "step": 2700 }, { "epoch": 0.05512787698608021, "grad_norm": 1.8751716613769531, "learning_rate": 9.999669098507517e-07, "loss": 1.9793, "num_input_tokens_seen": 2850464, "step": 2800 }, { "epoch": 0.05709672973558307, "grad_norm": 1.972404956817627, "learning_rate": 9.999644953156955e-07, "loss": 2.0275, "num_input_tokens_seen": 2952040, "step": 2900 }, { "epoch": 0.05906558248508594, "grad_norm": 1.9006932973861694, "learning_rate": 9.999619957658938e-07, "loss": 1.9634, "num_input_tokens_seen": 3054440, "step": 3000 }, { "epoch": 0.061034435234588805, "grad_norm": 2.1948349475860596, "learning_rate": 9.999594112017717e-07, "loss": 2.0436, "num_input_tokens_seen": 3155968, "step": 3100 }, { "epoch": 0.06300328798409167, "grad_norm": 1.8219056129455566, "learning_rate": 9.999567416237685e-07, "loss": 2.0317, "num_input_tokens_seen": 3256848, "step": 3200 }, { "epoch": 0.06497214073359453, "grad_norm": 2.4099738597869873, "learning_rate": 9.99953987032338e-07, "loss": 1.9925, "num_input_tokens_seen": 3359048, "step": 3300 }, { "epoch": 0.0669409934830974, "grad_norm": 2.0764834880828857, "learning_rate": 9.99951147427949e-07, "loss": 2.0083, "num_input_tokens_seen": 3459880, "step": 3400 }, { "epoch": 0.06890984623260027, "grad_norm": 1.985681176185608, "learning_rate": 9.999482228110844e-07, "loss": 1.9774, "num_input_tokens_seen": 3561432, "step": 3500 }, { "epoch": 0.07087869898210312, "grad_norm": 2.1401822566986084, "learning_rate": 9.99945213182241e-07, "loss": 2.0003, "num_input_tokens_seen": 3663832, "step": 3600 }, { "epoch": 0.072847551731606, "grad_norm": 1.6803908348083496, "learning_rate": 9.999421185419309e-07, "loss": 2.0152, "num_input_tokens_seen": 3765496, "step": 3700 }, { "epoch": 0.07481640448110886, "grad_norm": 1.7712147235870361, "learning_rate": 9.9993893889068e-07, "loss": 1.9871, "num_input_tokens_seen": 3866968, "step": 3800 }, { "epoch": 0.07678525723061172, "grad_norm": 1.8583042621612549, "learning_rate": 9.999356742290296e-07, "loss": 1.9997, "num_input_tokens_seen": 3968728, "step": 3900 }, { "epoch": 0.07875410998011459, "grad_norm": 1.827921986579895, "learning_rate": 9.999323245575343e-07, "loss": 1.9609, "num_input_tokens_seen": 4071128, "step": 4000 }, { "epoch": 0.08072296272961745, "grad_norm": 1.7819452285766602, "learning_rate": 9.999288898767638e-07, "loss": 2.0073, "num_input_tokens_seen": 4172968, "step": 4100 }, { "epoch": 0.08269181547912031, "grad_norm": 2.0420584678649902, "learning_rate": 9.999253701873024e-07, "loss": 1.9616, "num_input_tokens_seen": 4275368, "step": 4200 }, { "epoch": 0.08466066822862318, "grad_norm": 1.8458151817321777, "learning_rate": 9.999217654897481e-07, "loss": 1.976, "num_input_tokens_seen": 4377768, "step": 4300 }, { "epoch": 0.08662952097812604, "grad_norm": 1.8586608171463013, "learning_rate": 9.999180757847142e-07, "loss": 1.9941, "num_input_tokens_seen": 4478712, "step": 4400 }, { "epoch": 0.08859837372762891, "grad_norm": 2.039126396179199, "learning_rate": 9.999143010728277e-07, "loss": 1.981, "num_input_tokens_seen": 4581112, "step": 4500 }, { "epoch": 0.09056722647713178, "grad_norm": 1.9401798248291016, "learning_rate": 9.999104413547313e-07, "loss": 1.9281, "num_input_tokens_seen": 4683512, "step": 4600 }, { "epoch": 0.09253607922663463, "grad_norm": 1.9561400413513184, "learning_rate": 9.999064966310804e-07, "loss": 2.0088, "num_input_tokens_seen": 4785088, "step": 4700 }, { "epoch": 0.0945049319761375, "grad_norm": 2.128077268600464, "learning_rate": 9.999024669025464e-07, "loss": 2.0149, "num_input_tokens_seen": 4886864, "step": 4800 }, { "epoch": 0.09647378472564037, "grad_norm": 2.0094833374023438, "learning_rate": 9.998983521698141e-07, "loss": 2.034, "num_input_tokens_seen": 4989264, "step": 4900 }, { "epoch": 0.09844263747514323, "grad_norm": 1.8435900211334229, "learning_rate": 9.998941524335834e-07, "loss": 1.9854, "num_input_tokens_seen": 5091664, "step": 5000 }, { "epoch": 0.1004114902246461, "grad_norm": 1.8683404922485352, "learning_rate": 9.998898676945684e-07, "loss": 1.9672, "num_input_tokens_seen": 5192240, "step": 5100 }, { "epoch": 0.10238034297414897, "grad_norm": 1.8303996324539185, "learning_rate": 9.998854979534977e-07, "loss": 1.9863, "num_input_tokens_seen": 5293912, "step": 5200 }, { "epoch": 0.10434919572365182, "grad_norm": 1.9261983633041382, "learning_rate": 9.998810432111144e-07, "loss": 1.9907, "num_input_tokens_seen": 5395552, "step": 5300 }, { "epoch": 0.1063180484731547, "grad_norm": 1.674626350402832, "learning_rate": 9.998765034681758e-07, "loss": 2.031, "num_input_tokens_seen": 5496816, "step": 5400 }, { "epoch": 0.10828690122265756, "grad_norm": 1.9371134042739868, "learning_rate": 9.998718787254539e-07, "loss": 1.9305, "num_input_tokens_seen": 5598768, "step": 5500 }, { "epoch": 0.11025575397216042, "grad_norm": 5.228818893432617, "learning_rate": 9.99867168983735e-07, "loss": 1.9867, "num_input_tokens_seen": 5701168, "step": 5600 }, { "epoch": 0.11222460672166329, "grad_norm": 1.9057998657226562, "learning_rate": 9.998623742438202e-07, "loss": 1.9319, "num_input_tokens_seen": 5803568, "step": 5700 }, { "epoch": 0.11419345947116614, "grad_norm": 2.1413800716400146, "learning_rate": 9.998574945065248e-07, "loss": 1.9946, "num_input_tokens_seen": 5905968, "step": 5800 }, { "epoch": 0.11616231222066901, "grad_norm": 2.0873007774353027, "learning_rate": 9.998525297726783e-07, "loss": 1.938, "num_input_tokens_seen": 6008368, "step": 5900 }, { "epoch": 0.11813116497017188, "grad_norm": 1.8770489692687988, "learning_rate": 9.998474800431252e-07, "loss": 2.0306, "num_input_tokens_seen": 6109176, "step": 6000 }, { "epoch": 0.12010001771967474, "grad_norm": 2.004056692123413, "learning_rate": 9.99842345318724e-07, "loss": 1.9941, "num_input_tokens_seen": 6210776, "step": 6100 }, { "epoch": 0.12206887046917761, "grad_norm": 2.201301336288452, "learning_rate": 9.998371256003478e-07, "loss": 1.9467, "num_input_tokens_seen": 6312432, "step": 6200 }, { "epoch": 0.12403772321868048, "grad_norm": 1.8141396045684814, "learning_rate": 9.998318208888844e-07, "loss": 2.0164, "num_input_tokens_seen": 6413424, "step": 6300 }, { "epoch": 0.12600657596818335, "grad_norm": 1.8870656490325928, "learning_rate": 9.998264311852353e-07, "loss": 1.9963, "num_input_tokens_seen": 6515200, "step": 6400 }, { "epoch": 0.1279754287176862, "grad_norm": 2.036982774734497, "learning_rate": 9.998209564903175e-07, "loss": 1.9512, "num_input_tokens_seen": 6616832, "step": 6500 }, { "epoch": 0.12994428146718906, "grad_norm": 2.0326342582702637, "learning_rate": 9.998153968050617e-07, "loss": 2.0194, "num_input_tokens_seen": 6719232, "step": 6600 }, { "epoch": 0.13191313421669193, "grad_norm": 1.8858219385147095, "learning_rate": 9.998097521304132e-07, "loss": 1.9661, "num_input_tokens_seen": 6821632, "step": 6700 }, { "epoch": 0.1338819869661948, "grad_norm": 2.1061222553253174, "learning_rate": 9.998040224673321e-07, "loss": 2.0127, "num_input_tokens_seen": 6924032, "step": 6800 }, { "epoch": 0.13585083971569767, "grad_norm": 2.0696656703948975, "learning_rate": 9.997982078167925e-07, "loss": 1.9279, "num_input_tokens_seen": 7025856, "step": 6900 }, { "epoch": 0.13781969246520054, "grad_norm": 1.8722087144851685, "learning_rate": 9.997923081797832e-07, "loss": 2.0707, "num_input_tokens_seen": 7127656, "step": 7000 }, { "epoch": 0.13978854521470338, "grad_norm": 1.7816176414489746, "learning_rate": 9.997863235573072e-07, "loss": 1.9551, "num_input_tokens_seen": 7230056, "step": 7100 }, { "epoch": 0.14175739796420625, "grad_norm": 8.757242202758789, "learning_rate": 9.997802539503824e-07, "loss": 1.9781, "num_input_tokens_seen": 7331696, "step": 7200 }, { "epoch": 0.14372625071370912, "grad_norm": 2.2203147411346436, "learning_rate": 9.997740993600406e-07, "loss": 2.0103, "num_input_tokens_seen": 7433472, "step": 7300 }, { "epoch": 0.145695103463212, "grad_norm": 1.7229341268539429, "learning_rate": 9.997678597873285e-07, "loss": 2.0444, "num_input_tokens_seen": 7534736, "step": 7400 }, { "epoch": 0.14766395621271486, "grad_norm": 2.069223165512085, "learning_rate": 9.997615352333072e-07, "loss": 1.9925, "num_input_tokens_seen": 7637136, "step": 7500 }, { "epoch": 0.14963280896221773, "grad_norm": 2.1618645191192627, "learning_rate": 9.99755125699052e-07, "loss": 1.9466, "num_input_tokens_seen": 7739536, "step": 7600 }, { "epoch": 0.15160166171172057, "grad_norm": 1.8824418783187866, "learning_rate": 9.99748631185653e-07, "loss": 1.9693, "num_input_tokens_seen": 7841640, "step": 7700 }, { "epoch": 0.15357051446122344, "grad_norm": 1.7198357582092285, "learning_rate": 9.99742051694214e-07, "loss": 2.0451, "num_input_tokens_seen": 7943032, "step": 7800 }, { "epoch": 0.1555393672107263, "grad_norm": 1.7849814891815186, "learning_rate": 9.997353872258542e-07, "loss": 1.9811, "num_input_tokens_seen": 8044224, "step": 7900 }, { "epoch": 0.15750821996022918, "grad_norm": 1.9055421352386475, "learning_rate": 9.997286377817067e-07, "loss": 2.0199, "num_input_tokens_seen": 8145968, "step": 8000 }, { "epoch": 0.15947707270973205, "grad_norm": 2.1569740772247314, "learning_rate": 9.997218033629195e-07, "loss": 2.0, "num_input_tokens_seen": 8247792, "step": 8100 }, { "epoch": 0.1614459254592349, "grad_norm": 2.0451645851135254, "learning_rate": 9.997148839706545e-07, "loss": 1.9879, "num_input_tokens_seen": 8349264, "step": 8200 }, { "epoch": 0.16341477820873776, "grad_norm": 1.8271421194076538, "learning_rate": 9.99707879606088e-07, "loss": 1.9729, "num_input_tokens_seen": 8451152, "step": 8300 }, { "epoch": 0.16538363095824063, "grad_norm": 1.8655251264572144, "learning_rate": 9.997007902704115e-07, "loss": 2.0167, "num_input_tokens_seen": 8553040, "step": 8400 }, { "epoch": 0.1673524837077435, "grad_norm": 1.8250237703323364, "learning_rate": 9.996936159648302e-07, "loss": 2.0309, "num_input_tokens_seen": 8655440, "step": 8500 }, { "epoch": 0.16932133645724637, "grad_norm": 2.0580437183380127, "learning_rate": 9.99686356690564e-07, "loss": 1.9358, "num_input_tokens_seen": 8757840, "step": 8600 }, { "epoch": 0.17129018920674924, "grad_norm": 2.1230578422546387, "learning_rate": 9.996790124488477e-07, "loss": 1.9934, "num_input_tokens_seen": 8858664, "step": 8700 }, { "epoch": 0.17325904195625208, "grad_norm": 1.9461112022399902, "learning_rate": 9.996715832409295e-07, "loss": 1.9581, "num_input_tokens_seen": 8961064, "step": 8800 }, { "epoch": 0.17522789470575495, "grad_norm": 1.7925480604171753, "learning_rate": 9.996640690680732e-07, "loss": 1.9722, "num_input_tokens_seen": 9062776, "step": 8900 }, { "epoch": 0.17719674745525782, "grad_norm": 1.9623992443084717, "learning_rate": 9.996564699315561e-07, "loss": 1.9504, "num_input_tokens_seen": 9165176, "step": 9000 }, { "epoch": 0.1791656002047607, "grad_norm": 1.958788514137268, "learning_rate": 9.996487858326708e-07, "loss": 1.9981, "num_input_tokens_seen": 9266312, "step": 9100 }, { "epoch": 0.18113445295426356, "grad_norm": 1.8302428722381592, "learning_rate": 9.996410167727236e-07, "loss": 1.9342, "num_input_tokens_seen": 9367936, "step": 9200 }, { "epoch": 0.18310330570376643, "grad_norm": 1.993533730506897, "learning_rate": 9.99633162753036e-07, "loss": 1.9561, "num_input_tokens_seen": 9470336, "step": 9300 }, { "epoch": 0.18507215845326927, "grad_norm": 2.04597544670105, "learning_rate": 9.996252237749427e-07, "loss": 1.9991, "num_input_tokens_seen": 9572736, "step": 9400 }, { "epoch": 0.18704101120277214, "grad_norm": 1.8397719860076904, "learning_rate": 9.996171998397944e-07, "loss": 1.9686, "num_input_tokens_seen": 9675136, "step": 9500 }, { "epoch": 0.189009863952275, "grad_norm": 1.842245101928711, "learning_rate": 9.996090909489552e-07, "loss": 1.97, "num_input_tokens_seen": 9777536, "step": 9600 }, { "epoch": 0.19097871670177788, "grad_norm": 2.032658576965332, "learning_rate": 9.99600897103804e-07, "loss": 1.9829, "num_input_tokens_seen": 9879392, "step": 9700 }, { "epoch": 0.19294756945128075, "grad_norm": 1.9740447998046875, "learning_rate": 9.995926183057338e-07, "loss": 1.9965, "num_input_tokens_seen": 9980648, "step": 9800 }, { "epoch": 0.1949164222007836, "grad_norm": 1.9238474369049072, "learning_rate": 9.995842545561527e-07, "loss": 1.981, "num_input_tokens_seen": 10083048, "step": 9900 }, { "epoch": 0.19688527495028646, "grad_norm": 2.00785493850708, "learning_rate": 9.995758058564832e-07, "loss": 1.9643, "num_input_tokens_seen": 10184864, "step": 10000 }, { "epoch": 0.19885412769978933, "grad_norm": 2.054298162460327, "learning_rate": 9.99567272208161e-07, "loss": 1.9848, "num_input_tokens_seen": 10287264, "step": 10100 }, { "epoch": 0.2008229804492922, "grad_norm": 1.957642912864685, "learning_rate": 9.995586536126381e-07, "loss": 1.9503, "num_input_tokens_seen": 10389664, "step": 10200 }, { "epoch": 0.20279183319879507, "grad_norm": 1.7643107175827026, "learning_rate": 9.995499500713795e-07, "loss": 1.9629, "num_input_tokens_seen": 10491440, "step": 10300 }, { "epoch": 0.20476068594829794, "grad_norm": 3.147120714187622, "learning_rate": 9.995411615858654e-07, "loss": 1.9909, "num_input_tokens_seen": 10593840, "step": 10400 }, { "epoch": 0.20672953869780078, "grad_norm": 1.8852829933166504, "learning_rate": 9.995322881575898e-07, "loss": 2.0632, "num_input_tokens_seen": 10694928, "step": 10500 }, { "epoch": 0.20869839144730365, "grad_norm": 1.8616101741790771, "learning_rate": 9.99523329788062e-07, "loss": 2.0156, "num_input_tokens_seen": 10796552, "step": 10600 }, { "epoch": 0.21066724419680652, "grad_norm": 1.8704726696014404, "learning_rate": 9.995142864788051e-07, "loss": 1.9922, "num_input_tokens_seen": 10898160, "step": 10700 }, { "epoch": 0.2126360969463094, "grad_norm": 2.0497450828552246, "learning_rate": 9.99505158231357e-07, "loss": 1.9887, "num_input_tokens_seen": 11000560, "step": 10800 }, { "epoch": 0.21460494969581226, "grad_norm": 1.9596359729766846, "learning_rate": 9.994959450472697e-07, "loss": 1.9892, "num_input_tokens_seen": 11101736, "step": 10900 }, { "epoch": 0.21657380244531513, "grad_norm": 1.9173911809921265, "learning_rate": 9.9948664692811e-07, "loss": 1.999, "num_input_tokens_seen": 11203232, "step": 11000 }, { "epoch": 0.21854265519481797, "grad_norm": 1.9784644842147827, "learning_rate": 9.994772638754587e-07, "loss": 2.0029, "num_input_tokens_seen": 11305632, "step": 11100 }, { "epoch": 0.22051150794432084, "grad_norm": 2.05433988571167, "learning_rate": 9.994677958909117e-07, "loss": 1.9623, "num_input_tokens_seen": 11408032, "step": 11200 }, { "epoch": 0.2224803606938237, "grad_norm": 2.146061658859253, "learning_rate": 9.994582429760785e-07, "loss": 1.9939, "num_input_tokens_seen": 11510016, "step": 11300 }, { "epoch": 0.22444921344332658, "grad_norm": 1.7213598489761353, "learning_rate": 9.994486051325837e-07, "loss": 1.9569, "num_input_tokens_seen": 11611544, "step": 11400 }, { "epoch": 0.22641806619282945, "grad_norm": 1.723879337310791, "learning_rate": 9.99438882362066e-07, "loss": 2.0075, "num_input_tokens_seen": 11713528, "step": 11500 }, { "epoch": 0.2283869189423323, "grad_norm": 2.2458531856536865, "learning_rate": 9.99429074666179e-07, "loss": 1.9548, "num_input_tokens_seen": 11814312, "step": 11600 }, { "epoch": 0.23035577169183516, "grad_norm": 2.29547381401062, "learning_rate": 9.994191820465902e-07, "loss": 2.0178, "num_input_tokens_seen": 11916712, "step": 11700 }, { "epoch": 0.23232462444133803, "grad_norm": 1.716240644454956, "learning_rate": 9.99409204504982e-07, "loss": 1.9757, "num_input_tokens_seen": 12019112, "step": 11800 }, { "epoch": 0.2342934771908409, "grad_norm": 1.7362383604049683, "learning_rate": 9.993991420430506e-07, "loss": 1.9657, "num_input_tokens_seen": 12120656, "step": 11900 }, { "epoch": 0.23626232994034377, "grad_norm": 1.9518760442733765, "learning_rate": 9.993889946625073e-07, "loss": 1.9828, "num_input_tokens_seen": 12223056, "step": 12000 }, { "epoch": 0.23823118268984664, "grad_norm": 1.9001251459121704, "learning_rate": 9.993787623650773e-07, "loss": 1.9723, "num_input_tokens_seen": 12324856, "step": 12100 }, { "epoch": 0.24020003543934948, "grad_norm": 2.2148642539978027, "learning_rate": 9.993684451525008e-07, "loss": 2.0026, "num_input_tokens_seen": 12427256, "step": 12200 }, { "epoch": 0.24216888818885235, "grad_norm": 2.0185770988464355, "learning_rate": 9.993580430265321e-07, "loss": 1.9573, "num_input_tokens_seen": 12529656, "step": 12300 }, { "epoch": 0.24413774093835522, "grad_norm": 1.9938510656356812, "learning_rate": 9.993475559889401e-07, "loss": 1.9974, "num_input_tokens_seen": 12631136, "step": 12400 }, { "epoch": 0.2461065936878581, "grad_norm": 1.89698326587677, "learning_rate": 9.993369840415076e-07, "loss": 1.9587, "num_input_tokens_seen": 12732168, "step": 12500 }, { "epoch": 0.24807544643736096, "grad_norm": 2.3082284927368164, "learning_rate": 9.993263271860328e-07, "loss": 1.9721, "num_input_tokens_seen": 12833992, "step": 12600 }, { "epoch": 0.2500442991868638, "grad_norm": 2.106632947921753, "learning_rate": 9.993155854243275e-07, "loss": 2.0185, "num_input_tokens_seen": 12934888, "step": 12700 }, { "epoch": 0.2520131519363667, "grad_norm": 2.0504987239837646, "learning_rate": 9.993047587582184e-07, "loss": 1.9909, "num_input_tokens_seen": 13035984, "step": 12800 }, { "epoch": 0.25398200468586957, "grad_norm": 2.0314905643463135, "learning_rate": 9.992938471895462e-07, "loss": 2.1097, "num_input_tokens_seen": 13137528, "step": 12900 }, { "epoch": 0.2559508574353724, "grad_norm": 2.0496342182159424, "learning_rate": 9.992828507201666e-07, "loss": 1.9904, "num_input_tokens_seen": 13239928, "step": 13000 }, { "epoch": 0.25791971018487525, "grad_norm": 1.7611814737319946, "learning_rate": 9.992717693519497e-07, "loss": 1.9373, "num_input_tokens_seen": 13342328, "step": 13100 }, { "epoch": 0.2598885629343781, "grad_norm": 1.7378196716308594, "learning_rate": 9.992606030867792e-07, "loss": 1.974, "num_input_tokens_seen": 13443976, "step": 13200 }, { "epoch": 0.261857415683881, "grad_norm": 1.9088518619537354, "learning_rate": 9.99249351926554e-07, "loss": 1.9461, "num_input_tokens_seen": 13546376, "step": 13300 }, { "epoch": 0.26382626843338386, "grad_norm": 1.9952328205108643, "learning_rate": 9.992380158731875e-07, "loss": 1.9667, "num_input_tokens_seen": 13648776, "step": 13400 }, { "epoch": 0.26579512118288673, "grad_norm": 2.2141008377075195, "learning_rate": 9.99226594928607e-07, "loss": 1.9328, "num_input_tokens_seen": 13751176, "step": 13500 }, { "epoch": 0.2677639739323896, "grad_norm": 2.5217039585113525, "learning_rate": 9.992150890947551e-07, "loss": 1.9817, "num_input_tokens_seen": 13853576, "step": 13600 }, { "epoch": 0.26973282668189247, "grad_norm": 1.8611009120941162, "learning_rate": 9.992034983735876e-07, "loss": 1.9639, "num_input_tokens_seen": 13955480, "step": 13700 }, { "epoch": 0.27170167943139534, "grad_norm": 2.1035237312316895, "learning_rate": 9.991918227670757e-07, "loss": 1.9584, "num_input_tokens_seen": 14057880, "step": 13800 }, { "epoch": 0.2736705321808982, "grad_norm": 1.857049822807312, "learning_rate": 9.991800622772047e-07, "loss": 1.9333, "num_input_tokens_seen": 14160280, "step": 13900 }, { "epoch": 0.2756393849304011, "grad_norm": 1.952817678451538, "learning_rate": 9.991682169059744e-07, "loss": 1.9996, "num_input_tokens_seen": 14262680, "step": 14000 }, { "epoch": 0.27760823767990395, "grad_norm": 1.6179795265197754, "learning_rate": 9.99156286655399e-07, "loss": 1.9756, "num_input_tokens_seen": 14364768, "step": 14100 }, { "epoch": 0.27957709042940676, "grad_norm": 2.3380112648010254, "learning_rate": 9.991442715275072e-07, "loss": 1.9927, "num_input_tokens_seen": 14467168, "step": 14200 }, { "epoch": 0.28154594317890963, "grad_norm": 1.9956783056259155, "learning_rate": 9.99132171524342e-07, "loss": 2.0169, "num_input_tokens_seen": 14569568, "step": 14300 }, { "epoch": 0.2835147959284125, "grad_norm": 2.460737466812134, "learning_rate": 9.991199866479607e-07, "loss": 1.9812, "num_input_tokens_seen": 14671968, "step": 14400 }, { "epoch": 0.28548364867791537, "grad_norm": 1.8070951700210571, "learning_rate": 9.991077169004355e-07, "loss": 1.9598, "num_input_tokens_seen": 14773656, "step": 14500 }, { "epoch": 0.28745250142741824, "grad_norm": 2.0932130813598633, "learning_rate": 9.99095362283853e-07, "loss": 1.9954, "num_input_tokens_seen": 14875680, "step": 14600 }, { "epoch": 0.2894213541769211, "grad_norm": 1.738515853881836, "learning_rate": 9.990829228003136e-07, "loss": 1.9532, "num_input_tokens_seen": 14978080, "step": 14700 }, { "epoch": 0.291390206926424, "grad_norm": 1.9239839315414429, "learning_rate": 9.990703984519324e-07, "loss": 2.0163, "num_input_tokens_seen": 15080480, "step": 14800 }, { "epoch": 0.29335905967592685, "grad_norm": 1.837448000907898, "learning_rate": 9.990577892408396e-07, "loss": 1.9416, "num_input_tokens_seen": 15182336, "step": 14900 }, { "epoch": 0.2953279124254297, "grad_norm": 1.738576889038086, "learning_rate": 9.990450951691786e-07, "loss": 2.0071, "num_input_tokens_seen": 15283712, "step": 15000 }, { "epoch": 0.2972967651749326, "grad_norm": 2.089157819747925, "learning_rate": 9.990323162391087e-07, "loss": 1.9592, "num_input_tokens_seen": 15385432, "step": 15100 }, { "epoch": 0.29926561792443546, "grad_norm": 1.9610596895217896, "learning_rate": 9.990194524528024e-07, "loss": 1.8992, "num_input_tokens_seen": 15487832, "step": 15200 }, { "epoch": 0.30123447067393827, "grad_norm": 1.824532151222229, "learning_rate": 9.99006503812447e-07, "loss": 1.9654, "num_input_tokens_seen": 15589736, "step": 15300 }, { "epoch": 0.30320332342344114, "grad_norm": 1.8911042213439941, "learning_rate": 9.989934703202444e-07, "loss": 1.9673, "num_input_tokens_seen": 15692136, "step": 15400 }, { "epoch": 0.305172176172944, "grad_norm": 2.0679547786712646, "learning_rate": 9.98980351978411e-07, "loss": 2.0, "num_input_tokens_seen": 15793344, "step": 15500 }, { "epoch": 0.3071410289224469, "grad_norm": 2.1341922283172607, "learning_rate": 9.989671487891773e-07, "loss": 2.0583, "num_input_tokens_seen": 15894016, "step": 15600 }, { "epoch": 0.30910988167194975, "grad_norm": 2.13081955909729, "learning_rate": 9.989538607547883e-07, "loss": 1.9946, "num_input_tokens_seen": 15994488, "step": 15700 }, { "epoch": 0.3110787344214526, "grad_norm": 1.6867297887802124, "learning_rate": 9.989404878775038e-07, "loss": 1.9935, "num_input_tokens_seen": 16096888, "step": 15800 }, { "epoch": 0.3130475871709555, "grad_norm": 2.1156623363494873, "learning_rate": 9.989270301595976e-07, "loss": 1.981, "num_input_tokens_seen": 16198688, "step": 15900 }, { "epoch": 0.31501643992045836, "grad_norm": 1.995002031326294, "learning_rate": 9.98913487603358e-07, "loss": 1.9702, "num_input_tokens_seen": 16301088, "step": 16000 }, { "epoch": 0.3169852926699612, "grad_norm": 3.548793077468872, "learning_rate": 9.98899860211088e-07, "loss": 1.9911, "num_input_tokens_seen": 16402144, "step": 16100 }, { "epoch": 0.3189541454194641, "grad_norm": 1.8882970809936523, "learning_rate": 9.988861479851045e-07, "loss": 1.9966, "num_input_tokens_seen": 16503728, "step": 16200 }, { "epoch": 0.32092299816896697, "grad_norm": 2.009093999862671, "learning_rate": 9.988723509277395e-07, "loss": 1.9795, "num_input_tokens_seen": 16606128, "step": 16300 }, { "epoch": 0.3228918509184698, "grad_norm": 1.9955816268920898, "learning_rate": 9.98858469041339e-07, "loss": 1.9247, "num_input_tokens_seen": 16708528, "step": 16400 }, { "epoch": 0.32486070366797265, "grad_norm": 2.0711095333099365, "learning_rate": 9.988445023282631e-07, "loss": 2.0131, "num_input_tokens_seen": 16810928, "step": 16500 }, { "epoch": 0.3268295564174755, "grad_norm": 1.8371134996414185, "learning_rate": 9.988304507908872e-07, "loss": 1.9906, "num_input_tokens_seen": 16912496, "step": 16600 }, { "epoch": 0.3287984091669784, "grad_norm": 1.6996644735336304, "learning_rate": 9.988163144316005e-07, "loss": 1.972, "num_input_tokens_seen": 17014288, "step": 16700 }, { "epoch": 0.33076726191648126, "grad_norm": 1.89493989944458, "learning_rate": 9.988020932528065e-07, "loss": 1.9729, "num_input_tokens_seen": 17115536, "step": 16800 }, { "epoch": 0.33273611466598413, "grad_norm": 2.2083075046539307, "learning_rate": 9.987877872569239e-07, "loss": 2.0161, "num_input_tokens_seen": 17216168, "step": 16900 }, { "epoch": 0.334704967415487, "grad_norm": 1.9821158647537231, "learning_rate": 9.98773396446385e-07, "loss": 1.9248, "num_input_tokens_seen": 17318072, "step": 17000 }, { "epoch": 0.33667382016498987, "grad_norm": 2.1424949169158936, "learning_rate": 9.987589208236368e-07, "loss": 2.002, "num_input_tokens_seen": 17420472, "step": 17100 }, { "epoch": 0.33864267291449274, "grad_norm": 1.6785787343978882, "learning_rate": 9.987443603911408e-07, "loss": 1.997, "num_input_tokens_seen": 17521296, "step": 17200 }, { "epoch": 0.3406115256639956, "grad_norm": 3.9235410690307617, "learning_rate": 9.98729715151373e-07, "loss": 1.9538, "num_input_tokens_seen": 17623696, "step": 17300 }, { "epoch": 0.3425803784134985, "grad_norm": 1.839826226234436, "learning_rate": 9.987149851068237e-07, "loss": 2.0244, "num_input_tokens_seen": 17725640, "step": 17400 }, { "epoch": 0.34454923116300135, "grad_norm": 1.8692381381988525, "learning_rate": 9.987001702599974e-07, "loss": 2.046, "num_input_tokens_seen": 17828040, "step": 17500 }, { "epoch": 0.34651808391250416, "grad_norm": 1.8370298147201538, "learning_rate": 9.986852706134134e-07, "loss": 1.9298, "num_input_tokens_seen": 17930440, "step": 17600 }, { "epoch": 0.34848693666200703, "grad_norm": 2.1793007850646973, "learning_rate": 9.986702861696053e-07, "loss": 2.0293, "num_input_tokens_seen": 18031296, "step": 17700 }, { "epoch": 0.3504557894115099, "grad_norm": 2.0108096599578857, "learning_rate": 9.986552169311211e-07, "loss": 1.9718, "num_input_tokens_seen": 18132232, "step": 17800 }, { "epoch": 0.35242464216101277, "grad_norm": 1.7110337018966675, "learning_rate": 9.98640062900523e-07, "loss": 2.0083, "num_input_tokens_seen": 18233696, "step": 17900 }, { "epoch": 0.35439349491051564, "grad_norm": 1.9672855138778687, "learning_rate": 9.986248240803878e-07, "loss": 1.9991, "num_input_tokens_seen": 18335176, "step": 18000 }, { "epoch": 0.3563623476600185, "grad_norm": 1.8208098411560059, "learning_rate": 9.98609500473307e-07, "loss": 1.9941, "num_input_tokens_seen": 18437576, "step": 18100 }, { "epoch": 0.3583312004095214, "grad_norm": 2.0128719806671143, "learning_rate": 9.985940920818863e-07, "loss": 1.9779, "num_input_tokens_seen": 18539976, "step": 18200 }, { "epoch": 0.36030005315902425, "grad_norm": 2.181436777114868, "learning_rate": 9.985785989087454e-07, "loss": 1.9569, "num_input_tokens_seen": 18641568, "step": 18300 }, { "epoch": 0.3622689059085271, "grad_norm": 1.764965534210205, "learning_rate": 9.985630209565187e-07, "loss": 2.0087, "num_input_tokens_seen": 18742960, "step": 18400 }, { "epoch": 0.36423775865803, "grad_norm": 1.8703725337982178, "learning_rate": 9.985473582278558e-07, "loss": 2.0024, "num_input_tokens_seen": 18843736, "step": 18500 }, { "epoch": 0.36620661140753286, "grad_norm": 2.4369924068450928, "learning_rate": 9.985316107254193e-07, "loss": 2.0421, "num_input_tokens_seen": 18945800, "step": 18600 }, { "epoch": 0.36817546415703567, "grad_norm": 2.4326465129852295, "learning_rate": 9.985157784518873e-07, "loss": 2.0023, "num_input_tokens_seen": 19047632, "step": 18700 }, { "epoch": 0.37014431690653854, "grad_norm": 2.0959532260894775, "learning_rate": 9.98499861409952e-07, "loss": 1.9879, "num_input_tokens_seen": 19148216, "step": 18800 }, { "epoch": 0.3721131696560414, "grad_norm": 2.0876073837280273, "learning_rate": 9.984838596023195e-07, "loss": 1.9447, "num_input_tokens_seen": 19250616, "step": 18900 }, { "epoch": 0.3740820224055443, "grad_norm": 1.9188069105148315, "learning_rate": 9.984677730317112e-07, "loss": 1.9803, "num_input_tokens_seen": 19353016, "step": 19000 }, { "epoch": 0.37605087515504715, "grad_norm": 2.220883846282959, "learning_rate": 9.984516017008623e-07, "loss": 1.9828, "num_input_tokens_seen": 19455416, "step": 19100 }, { "epoch": 0.37801972790455, "grad_norm": 1.742920160293579, "learning_rate": 9.98435345612523e-07, "loss": 1.9757, "num_input_tokens_seen": 19557144, "step": 19200 }, { "epoch": 0.3799885806540529, "grad_norm": 2.3675317764282227, "learning_rate": 9.98419004769457e-07, "loss": 1.9925, "num_input_tokens_seen": 19659544, "step": 19300 }, { "epoch": 0.38195743340355576, "grad_norm": 1.9253923892974854, "learning_rate": 9.98402579174443e-07, "loss": 1.9984, "num_input_tokens_seen": 19760632, "step": 19400 }, { "epoch": 0.3839262861530586, "grad_norm": 1.7223613262176514, "learning_rate": 9.98386068830274e-07, "loss": 1.9942, "num_input_tokens_seen": 19861480, "step": 19500 }, { "epoch": 0.3858951389025615, "grad_norm": 1.9526335000991821, "learning_rate": 9.983694737397579e-07, "loss": 1.9479, "num_input_tokens_seen": 19963304, "step": 19600 }, { "epoch": 0.38786399165206437, "grad_norm": 1.8235361576080322, "learning_rate": 9.98352793905716e-07, "loss": 2.0048, "num_input_tokens_seen": 20065704, "step": 19700 }, { "epoch": 0.3898328444015672, "grad_norm": 1.9568926095962524, "learning_rate": 9.983360293309849e-07, "loss": 1.9596, "num_input_tokens_seen": 20166448, "step": 19800 }, { "epoch": 0.39180169715107005, "grad_norm": 1.8626855611801147, "learning_rate": 9.983191800184153e-07, "loss": 1.9505, "num_input_tokens_seen": 20268848, "step": 19900 }, { "epoch": 0.3937705499005729, "grad_norm": 3.4124538898468018, "learning_rate": 9.98302245970872e-07, "loss": 1.9489, "num_input_tokens_seen": 20370352, "step": 20000 }, { "epoch": 0.3957394026500758, "grad_norm": 1.766021966934204, "learning_rate": 9.982852271912345e-07, "loss": 1.9811, "num_input_tokens_seen": 20472752, "step": 20100 }, { "epoch": 0.39770825539957866, "grad_norm": 1.6870958805084229, "learning_rate": 9.982681236823972e-07, "loss": 1.9737, "num_input_tokens_seen": 20575152, "step": 20200 }, { "epoch": 0.39967710814908153, "grad_norm": 1.963392972946167, "learning_rate": 9.982509354472677e-07, "loss": 1.9698, "num_input_tokens_seen": 20677552, "step": 20300 }, { "epoch": 0.4016459608985844, "grad_norm": 2.3354506492614746, "learning_rate": 9.982336624887693e-07, "loss": 1.9336, "num_input_tokens_seen": 20779952, "step": 20400 }, { "epoch": 0.40361481364808727, "grad_norm": 1.7728350162506104, "learning_rate": 9.982163048098388e-07, "loss": 2.0186, "num_input_tokens_seen": 20881720, "step": 20500 }, { "epoch": 0.40558366639759014, "grad_norm": 2.3466122150421143, "learning_rate": 9.981988624134278e-07, "loss": 2.0098, "num_input_tokens_seen": 20983992, "step": 20600 }, { "epoch": 0.407552519147093, "grad_norm": 1.976973533630371, "learning_rate": 9.981813353025023e-07, "loss": 1.9684, "num_input_tokens_seen": 21084976, "step": 20700 }, { "epoch": 0.4095213718965959, "grad_norm": 2.03411602973938, "learning_rate": 9.981637234800426e-07, "loss": 2.0056, "num_input_tokens_seen": 21186616, "step": 20800 }, { "epoch": 0.41149022464609875, "grad_norm": 2.321899890899658, "learning_rate": 9.981460269490434e-07, "loss": 1.9566, "num_input_tokens_seen": 21289016, "step": 20900 }, { "epoch": 0.41345907739560156, "grad_norm": 1.7294496297836304, "learning_rate": 9.981282457125138e-07, "loss": 1.9389, "num_input_tokens_seen": 21390608, "step": 21000 }, { "epoch": 0.41542793014510443, "grad_norm": 1.7769101858139038, "learning_rate": 9.981103797734775e-07, "loss": 1.964, "num_input_tokens_seen": 21490552, "step": 21100 }, { "epoch": 0.4173967828946073, "grad_norm": 2.246415376663208, "learning_rate": 9.980924291349724e-07, "loss": 1.9817, "num_input_tokens_seen": 21592952, "step": 21200 }, { "epoch": 0.41936563564411017, "grad_norm": 2.132946491241455, "learning_rate": 9.980743938000508e-07, "loss": 1.9842, "num_input_tokens_seen": 21695056, "step": 21300 }, { "epoch": 0.42133448839361304, "grad_norm": 1.9394687414169312, "learning_rate": 9.980562737717797e-07, "loss": 2.0426, "num_input_tokens_seen": 21796200, "step": 21400 }, { "epoch": 0.4233033411431159, "grad_norm": 1.7560745477676392, "learning_rate": 9.980380690532398e-07, "loss": 1.9813, "num_input_tokens_seen": 21898600, "step": 21500 }, { "epoch": 0.4252721938926188, "grad_norm": 1.9573732614517212, "learning_rate": 9.98019779647527e-07, "loss": 1.9945, "num_input_tokens_seen": 22000320, "step": 21600 }, { "epoch": 0.42724104664212165, "grad_norm": 1.7684223651885986, "learning_rate": 9.98001405557751e-07, "loss": 1.9862, "num_input_tokens_seen": 22101792, "step": 21700 }, { "epoch": 0.4292098993916245, "grad_norm": 2.01031231880188, "learning_rate": 9.979829467870365e-07, "loss": 1.9401, "num_input_tokens_seen": 22204192, "step": 21800 }, { "epoch": 0.4311787521411274, "grad_norm": 2.198564052581787, "learning_rate": 9.97964403338522e-07, "loss": 1.9513, "num_input_tokens_seen": 22306048, "step": 21900 }, { "epoch": 0.43314760489063026, "grad_norm": 1.8433729410171509, "learning_rate": 9.979457752153606e-07, "loss": 2.0143, "num_input_tokens_seen": 22407904, "step": 22000 }, { "epoch": 0.43511645764013307, "grad_norm": 2.1233456134796143, "learning_rate": 9.979270624207201e-07, "loss": 1.9838, "num_input_tokens_seen": 22510208, "step": 22100 }, { "epoch": 0.43708531038963594, "grad_norm": 1.9405741691589355, "learning_rate": 9.979082649577826e-07, "loss": 1.9756, "num_input_tokens_seen": 22611880, "step": 22200 }, { "epoch": 0.4390541631391388, "grad_norm": 1.918004035949707, "learning_rate": 9.97889382829744e-07, "loss": 1.9765, "num_input_tokens_seen": 22714280, "step": 22300 }, { "epoch": 0.4410230158886417, "grad_norm": 1.7071990966796875, "learning_rate": 9.97870416039815e-07, "loss": 1.9983, "num_input_tokens_seen": 22815312, "step": 22400 }, { "epoch": 0.44299186863814455, "grad_norm": 1.6820582151412964, "learning_rate": 9.978513645912214e-07, "loss": 1.9532, "num_input_tokens_seen": 22917712, "step": 22500 }, { "epoch": 0.4449607213876474, "grad_norm": 2.0030429363250732, "learning_rate": 9.978322284872021e-07, "loss": 1.9757, "num_input_tokens_seen": 23019560, "step": 22600 }, { "epoch": 0.4469295741371503, "grad_norm": 2.0032713413238525, "learning_rate": 9.978130077310113e-07, "loss": 1.973, "num_input_tokens_seen": 23121960, "step": 22700 }, { "epoch": 0.44889842688665316, "grad_norm": 1.9425948858261108, "learning_rate": 9.977937023259173e-07, "loss": 1.9391, "num_input_tokens_seen": 23224360, "step": 22800 }, { "epoch": 0.450867279636156, "grad_norm": 2.022223472595215, "learning_rate": 9.97774312275203e-07, "loss": 1.9796, "num_input_tokens_seen": 23326288, "step": 22900 }, { "epoch": 0.4528361323856589, "grad_norm": 10.623699188232422, "learning_rate": 9.977548375821649e-07, "loss": 1.9697, "num_input_tokens_seen": 23428688, "step": 23000 }, { "epoch": 0.45480498513516177, "grad_norm": 1.8131344318389893, "learning_rate": 9.977352782501151e-07, "loss": 1.9518, "num_input_tokens_seen": 23531088, "step": 23100 }, { "epoch": 0.4567738378846646, "grad_norm": 2.295611619949341, "learning_rate": 9.977156342823794e-07, "loss": 1.9559, "num_input_tokens_seen": 23633488, "step": 23200 }, { "epoch": 0.45874269063416745, "grad_norm": 1.9369465112686157, "learning_rate": 9.97695905682298e-07, "loss": 2.0245, "num_input_tokens_seen": 23735888, "step": 23300 }, { "epoch": 0.4607115433836703, "grad_norm": 1.5936909914016724, "learning_rate": 9.976760924532254e-07, "loss": 1.9733, "num_input_tokens_seen": 23837088, "step": 23400 }, { "epoch": 0.4626803961331732, "grad_norm": 2.748304605484009, "learning_rate": 9.976561945985312e-07, "loss": 1.9581, "num_input_tokens_seen": 23939488, "step": 23500 }, { "epoch": 0.46464924888267606, "grad_norm": 2.1762402057647705, "learning_rate": 9.976362121215984e-07, "loss": 2.0525, "num_input_tokens_seen": 24039880, "step": 23600 }, { "epoch": 0.4666181016321789, "grad_norm": 2.3380510807037354, "learning_rate": 9.97616145025825e-07, "loss": 1.9868, "num_input_tokens_seen": 24141800, "step": 23700 }, { "epoch": 0.4685869543816818, "grad_norm": 11.509876251220703, "learning_rate": 9.975959933146231e-07, "loss": 2.006, "num_input_tokens_seen": 24242464, "step": 23800 }, { "epoch": 0.47055580713118467, "grad_norm": 1.9859744310379028, "learning_rate": 9.975757569914196e-07, "loss": 1.9781, "num_input_tokens_seen": 24344864, "step": 23900 }, { "epoch": 0.47252465988068754, "grad_norm": 1.9662309885025024, "learning_rate": 9.975554360596554e-07, "loss": 2.0087, "num_input_tokens_seen": 24447264, "step": 24000 }, { "epoch": 0.4744935126301904, "grad_norm": 2.0596296787261963, "learning_rate": 9.975350305227858e-07, "loss": 1.9695, "num_input_tokens_seen": 24549664, "step": 24100 }, { "epoch": 0.4764623653796933, "grad_norm": 1.8109757900238037, "learning_rate": 9.975145403842806e-07, "loss": 2.0153, "num_input_tokens_seen": 24651336, "step": 24200 }, { "epoch": 0.4784312181291961, "grad_norm": 2.031705617904663, "learning_rate": 9.97493965647624e-07, "loss": 1.9615, "num_input_tokens_seen": 24753216, "step": 24300 }, { "epoch": 0.48040007087869896, "grad_norm": 1.7945663928985596, "learning_rate": 9.974733063163148e-07, "loss": 1.9781, "num_input_tokens_seen": 24854888, "step": 24400 }, { "epoch": 0.48236892362820183, "grad_norm": 1.979754090309143, "learning_rate": 9.974525623938657e-07, "loss": 2.0, "num_input_tokens_seen": 24957288, "step": 24500 }, { "epoch": 0.4843377763777047, "grad_norm": 1.6545555591583252, "learning_rate": 9.97431733883804e-07, "loss": 1.985, "num_input_tokens_seen": 25059480, "step": 24600 }, { "epoch": 0.48630662912720757, "grad_norm": 1.8531856536865234, "learning_rate": 9.974108207896715e-07, "loss": 1.962, "num_input_tokens_seen": 25161104, "step": 24700 }, { "epoch": 0.48827548187671044, "grad_norm": 1.9446638822555542, "learning_rate": 9.973898231150243e-07, "loss": 1.9843, "num_input_tokens_seen": 25262976, "step": 24800 }, { "epoch": 0.4902443346262133, "grad_norm": 3.837829351425171, "learning_rate": 9.973687408634329e-07, "loss": 1.9635, "num_input_tokens_seen": 25365376, "step": 24900 }, { "epoch": 0.4922131873757162, "grad_norm": 2.8814587593078613, "learning_rate": 9.97347574038482e-07, "loss": 1.9557, "num_input_tokens_seen": 25466552, "step": 25000 }, { "epoch": 0.49418204012521905, "grad_norm": 2.385894298553467, "learning_rate": 9.97326322643771e-07, "loss": 1.9476, "num_input_tokens_seen": 25567552, "step": 25100 }, { "epoch": 0.4961508928747219, "grad_norm": 1.806183934211731, "learning_rate": 9.973049866829136e-07, "loss": 2.011, "num_input_tokens_seen": 25669424, "step": 25200 }, { "epoch": 0.4981197456242248, "grad_norm": 2.0874178409576416, "learning_rate": 9.972835661595377e-07, "loss": 1.9997, "num_input_tokens_seen": 25771736, "step": 25300 }, { "epoch": 0.5000885983737277, "grad_norm": 1.8951822519302368, "learning_rate": 9.972620610772853e-07, "loss": 2.0229, "num_input_tokens_seen": 25872328, "step": 25400 }, { "epoch": 0.5020574511232305, "grad_norm": 1.9708080291748047, "learning_rate": 9.972404714398137e-07, "loss": 2.0027, "num_input_tokens_seen": 25972904, "step": 25500 }, { "epoch": 0.5040263038727334, "grad_norm": 2.000120162963867, "learning_rate": 9.97218797250794e-07, "loss": 1.9755, "num_input_tokens_seen": 26075304, "step": 25600 }, { "epoch": 0.5059951566222363, "grad_norm": 2.2590060234069824, "learning_rate": 9.971970385139114e-07, "loss": 1.9947, "num_input_tokens_seen": 26176432, "step": 25700 }, { "epoch": 0.5079640093717391, "grad_norm": 1.7969183921813965, "learning_rate": 9.971751952328662e-07, "loss": 2.0154, "num_input_tokens_seen": 26277160, "step": 25800 }, { "epoch": 0.509932862121242, "grad_norm": 1.7295571565628052, "learning_rate": 9.97153267411372e-07, "loss": 2.0067, "num_input_tokens_seen": 26378984, "step": 25900 }, { "epoch": 0.5119017148707448, "grad_norm": 1.9042713642120361, "learning_rate": 9.971312550531582e-07, "loss": 2.0285, "num_input_tokens_seen": 26480568, "step": 26000 }, { "epoch": 0.5138705676202476, "grad_norm": 2.275377035140991, "learning_rate": 9.971091581619674e-07, "loss": 1.9685, "num_input_tokens_seen": 26582968, "step": 26100 }, { "epoch": 0.5158394203697505, "grad_norm": 1.9059438705444336, "learning_rate": 9.970869767415572e-07, "loss": 1.9568, "num_input_tokens_seen": 26685368, "step": 26200 }, { "epoch": 0.5178082731192534, "grad_norm": 1.9786062240600586, "learning_rate": 9.97064710795699e-07, "loss": 2.0229, "num_input_tokens_seen": 26787328, "step": 26300 }, { "epoch": 0.5197771258687562, "grad_norm": 2.0805323123931885, "learning_rate": 9.970423603281792e-07, "loss": 1.9997, "num_input_tokens_seen": 26888392, "step": 26400 }, { "epoch": 0.5217459786182591, "grad_norm": 2.0707738399505615, "learning_rate": 9.970199253427984e-07, "loss": 2.0388, "num_input_tokens_seen": 26990144, "step": 26500 }, { "epoch": 0.523714831367762, "grad_norm": 2.0716702938079834, "learning_rate": 9.969974058433712e-07, "loss": 2.0231, "num_input_tokens_seen": 27091712, "step": 26600 }, { "epoch": 0.5256836841172648, "grad_norm": 2.0474820137023926, "learning_rate": 9.96974801833727e-07, "loss": 1.9902, "num_input_tokens_seen": 27194112, "step": 26700 }, { "epoch": 0.5276525368667677, "grad_norm": 1.650191068649292, "learning_rate": 9.969521133177095e-07, "loss": 2.0088, "num_input_tokens_seen": 27296512, "step": 26800 }, { "epoch": 0.5296213896162706, "grad_norm": 1.7658859491348267, "learning_rate": 9.969293402991768e-07, "loss": 1.9835, "num_input_tokens_seen": 27398144, "step": 26900 }, { "epoch": 0.5315902423657735, "grad_norm": 2.167694091796875, "learning_rate": 9.969064827820009e-07, "loss": 1.9739, "num_input_tokens_seen": 27500544, "step": 27000 }, { "epoch": 0.5335590951152763, "grad_norm": 2.014336347579956, "learning_rate": 9.968835407700687e-07, "loss": 1.9942, "num_input_tokens_seen": 27602944, "step": 27100 }, { "epoch": 0.5355279478647792, "grad_norm": 1.7608187198638916, "learning_rate": 9.968605142672813e-07, "loss": 1.9802, "num_input_tokens_seen": 27704704, "step": 27200 }, { "epoch": 0.5374968006142821, "grad_norm": 1.9191299676895142, "learning_rate": 9.968374032775542e-07, "loss": 2.0222, "num_input_tokens_seen": 27805696, "step": 27300 }, { "epoch": 0.5394656533637849, "grad_norm": 2.027268171310425, "learning_rate": 9.968142078048171e-07, "loss": 1.986, "num_input_tokens_seen": 27907048, "step": 27400 }, { "epoch": 0.5414345061132878, "grad_norm": 1.9456969499588013, "learning_rate": 9.967909278530144e-07, "loss": 1.9341, "num_input_tokens_seen": 28009448, "step": 27500 }, { "epoch": 0.5434033588627907, "grad_norm": 1.9179428815841675, "learning_rate": 9.967675634261043e-07, "loss": 1.9452, "num_input_tokens_seen": 28111848, "step": 27600 }, { "epoch": 0.5453722116122935, "grad_norm": 4.113171100616455, "learning_rate": 9.967441145280603e-07, "loss": 1.9426, "num_input_tokens_seen": 28213720, "step": 27700 }, { "epoch": 0.5473410643617964, "grad_norm": 1.9656105041503906, "learning_rate": 9.967205811628691e-07, "loss": 1.9545, "num_input_tokens_seen": 28316120, "step": 27800 }, { "epoch": 0.5493099171112993, "grad_norm": 1.8778409957885742, "learning_rate": 9.966969633345327e-07, "loss": 1.9895, "num_input_tokens_seen": 28417872, "step": 27900 }, { "epoch": 0.5512787698608022, "grad_norm": 1.7477798461914062, "learning_rate": 9.96673261047067e-07, "loss": 1.9813, "num_input_tokens_seen": 28519480, "step": 28000 }, { "epoch": 0.553247622610305, "grad_norm": 1.9219067096710205, "learning_rate": 9.966494743045026e-07, "loss": 1.9445, "num_input_tokens_seen": 28621056, "step": 28100 }, { "epoch": 0.5552164753598079, "grad_norm": 1.8571618795394897, "learning_rate": 9.96625603110884e-07, "loss": 2.0336, "num_input_tokens_seen": 28721304, "step": 28200 }, { "epoch": 0.5571853281093107, "grad_norm": 2.0712876319885254, "learning_rate": 9.9660164747027e-07, "loss": 1.9648, "num_input_tokens_seen": 28823704, "step": 28300 }, { "epoch": 0.5591541808588135, "grad_norm": 2.039795398712158, "learning_rate": 9.965776073867346e-07, "loss": 1.9886, "num_input_tokens_seen": 28926104, "step": 28400 }, { "epoch": 0.5611230336083164, "grad_norm": 2.1456973552703857, "learning_rate": 9.965534828643655e-07, "loss": 2.0234, "num_input_tokens_seen": 29027024, "step": 28500 }, { "epoch": 0.5630918863578193, "grad_norm": 1.9275834560394287, "learning_rate": 9.965292739072645e-07, "loss": 1.9691, "num_input_tokens_seen": 29128792, "step": 28600 }, { "epoch": 0.5650607391073221, "grad_norm": 2.1054723262786865, "learning_rate": 9.965049805195486e-07, "loss": 2.0016, "num_input_tokens_seen": 29230640, "step": 28700 }, { "epoch": 0.567029591856825, "grad_norm": 1.7965164184570312, "learning_rate": 9.964806027053485e-07, "loss": 1.9457, "num_input_tokens_seen": 29332304, "step": 28800 }, { "epoch": 0.5689984446063279, "grad_norm": 2.277695417404175, "learning_rate": 9.964561404688095e-07, "loss": 2.1029, "num_input_tokens_seen": 29432944, "step": 28900 }, { "epoch": 0.5709672973558307, "grad_norm": 1.9059792757034302, "learning_rate": 9.964315938140908e-07, "loss": 1.9671, "num_input_tokens_seen": 29533832, "step": 29000 }, { "epoch": 0.5729361501053336, "grad_norm": 1.7866215705871582, "learning_rate": 9.96406962745367e-07, "loss": 1.9647, "num_input_tokens_seen": 29636232, "step": 29100 }, { "epoch": 0.5749050028548365, "grad_norm": 1.9816657304763794, "learning_rate": 9.963822472668257e-07, "loss": 1.973, "num_input_tokens_seen": 29738632, "step": 29200 }, { "epoch": 0.5768738556043393, "grad_norm": 2.256709575653076, "learning_rate": 9.963574473826702e-07, "loss": 1.9592, "num_input_tokens_seen": 29840240, "step": 29300 }, { "epoch": 0.5788427083538422, "grad_norm": 2.0529892444610596, "learning_rate": 9.963325630971172e-07, "loss": 1.9705, "num_input_tokens_seen": 29942640, "step": 29400 }, { "epoch": 0.5808115611033451, "grad_norm": 1.9970955848693848, "learning_rate": 9.96307594414398e-07, "loss": 2.0044, "num_input_tokens_seen": 30043048, "step": 29500 }, { "epoch": 0.582780413852848, "grad_norm": 2.6988253593444824, "learning_rate": 9.962825413387585e-07, "loss": 1.9797, "num_input_tokens_seen": 30145448, "step": 29600 }, { "epoch": 0.5847492666023508, "grad_norm": 1.9380062818527222, "learning_rate": 9.962574038744587e-07, "loss": 1.9933, "num_input_tokens_seen": 30247384, "step": 29700 }, { "epoch": 0.5867181193518537, "grad_norm": 1.6916238069534302, "learning_rate": 9.96232182025773e-07, "loss": 1.9392, "num_input_tokens_seen": 30349784, "step": 29800 }, { "epoch": 0.5886869721013566, "grad_norm": 1.9196768999099731, "learning_rate": 9.962068757969902e-07, "loss": 2.0357, "num_input_tokens_seen": 30452184, "step": 29900 }, { "epoch": 0.5906558248508594, "grad_norm": 1.998913049697876, "learning_rate": 9.961814851924132e-07, "loss": 1.9804, "num_input_tokens_seen": 30553720, "step": 30000 }, { "epoch": 0.5926246776003623, "grad_norm": 2.203202247619629, "learning_rate": 9.9615601021636e-07, "loss": 1.9781, "num_input_tokens_seen": 30656120, "step": 30100 }, { "epoch": 0.5945935303498652, "grad_norm": 1.7600692510604858, "learning_rate": 9.961304508731616e-07, "loss": 2.001, "num_input_tokens_seen": 30757728, "step": 30200 }, { "epoch": 0.596562383099368, "grad_norm": 1.7541885375976562, "learning_rate": 9.961048071671648e-07, "loss": 2.0252, "num_input_tokens_seen": 30860128, "step": 30300 }, { "epoch": 0.5985312358488709, "grad_norm": 2.0382676124572754, "learning_rate": 9.960790791027297e-07, "loss": 1.9911, "num_input_tokens_seen": 30960928, "step": 30400 }, { "epoch": 0.6005000885983738, "grad_norm": 3.8343398571014404, "learning_rate": 9.960532666842317e-07, "loss": 1.953, "num_input_tokens_seen": 31062480, "step": 30500 }, { "epoch": 0.6024689413478765, "grad_norm": 1.8821525573730469, "learning_rate": 9.960273699160593e-07, "loss": 1.9692, "num_input_tokens_seen": 31163536, "step": 30600 }, { "epoch": 0.6044377940973794, "grad_norm": 1.9925546646118164, "learning_rate": 9.960013888026165e-07, "loss": 2.0457, "num_input_tokens_seen": 31264296, "step": 30700 }, { "epoch": 0.6064066468468823, "grad_norm": 2.1350507736206055, "learning_rate": 9.95975323348321e-07, "loss": 1.9794, "num_input_tokens_seen": 31366200, "step": 30800 }, { "epoch": 0.6083754995963851, "grad_norm": 2.237239360809326, "learning_rate": 9.959491735576048e-07, "loss": 1.9788, "num_input_tokens_seen": 31468056, "step": 30900 }, { "epoch": 0.610344352345888, "grad_norm": 1.8590543270111084, "learning_rate": 9.95922939434915e-07, "loss": 2.0007, "num_input_tokens_seen": 31570024, "step": 31000 }, { "epoch": 0.6123132050953909, "grad_norm": 1.8755176067352295, "learning_rate": 9.958966209847119e-07, "loss": 2.0011, "num_input_tokens_seen": 31670456, "step": 31100 }, { "epoch": 0.6142820578448938, "grad_norm": 1.9030488729476929, "learning_rate": 9.95870218211471e-07, "loss": 2.0265, "num_input_tokens_seen": 31772304, "step": 31200 }, { "epoch": 0.6162509105943966, "grad_norm": 1.8745536804199219, "learning_rate": 9.95843731119682e-07, "loss": 2.0047, "num_input_tokens_seen": 31874224, "step": 31300 }, { "epoch": 0.6182197633438995, "grad_norm": 1.827952265739441, "learning_rate": 9.958171597138484e-07, "loss": 1.9865, "num_input_tokens_seen": 31975768, "step": 31400 }, { "epoch": 0.6201886160934024, "grad_norm": 2.2605669498443604, "learning_rate": 9.957905039984888e-07, "loss": 1.9703, "num_input_tokens_seen": 32077064, "step": 31500 }, { "epoch": 0.6221574688429052, "grad_norm": 1.8941019773483276, "learning_rate": 9.957637639781359e-07, "loss": 1.9939, "num_input_tokens_seen": 32178832, "step": 31600 }, { "epoch": 0.6241263215924081, "grad_norm": 1.839310646057129, "learning_rate": 9.957369396573362e-07, "loss": 1.9873, "num_input_tokens_seen": 32281232, "step": 31700 }, { "epoch": 0.626095174341911, "grad_norm": 1.820748209953308, "learning_rate": 9.957100310406511e-07, "loss": 2.0094, "num_input_tokens_seen": 32383632, "step": 31800 }, { "epoch": 0.6280640270914138, "grad_norm": 1.7682918310165405, "learning_rate": 9.956830381326565e-07, "loss": 1.9753, "num_input_tokens_seen": 32486032, "step": 31900 }, { "epoch": 0.6300328798409167, "grad_norm": 1.772078514099121, "learning_rate": 9.956559609379418e-07, "loss": 2.0039, "num_input_tokens_seen": 32588432, "step": 32000 }, { "epoch": 0.6320017325904196, "grad_norm": 1.7432925701141357, "learning_rate": 9.956287994611116e-07, "loss": 2.0084, "num_input_tokens_seen": 32688968, "step": 32100 }, { "epoch": 0.6339705853399225, "grad_norm": 2.1200695037841797, "learning_rate": 9.956015537067843e-07, "loss": 2.0633, "num_input_tokens_seen": 32790728, "step": 32200 }, { "epoch": 0.6359394380894253, "grad_norm": 1.9778767824172974, "learning_rate": 9.95574223679593e-07, "loss": 2.0204, "num_input_tokens_seen": 32892320, "step": 32300 }, { "epoch": 0.6379082908389282, "grad_norm": 1.8613898754119873, "learning_rate": 9.955468093841848e-07, "loss": 1.9224, "num_input_tokens_seen": 32994720, "step": 32400 }, { "epoch": 0.6398771435884311, "grad_norm": 1.9930576086044312, "learning_rate": 9.955193108252214e-07, "loss": 2.0055, "num_input_tokens_seen": 33095600, "step": 32500 }, { "epoch": 0.6418459963379339, "grad_norm": 1.9638782739639282, "learning_rate": 9.954917280073784e-07, "loss": 1.993, "num_input_tokens_seen": 33197152, "step": 32600 }, { "epoch": 0.6438148490874368, "grad_norm": 1.9950863122940063, "learning_rate": 9.954640609353462e-07, "loss": 1.9665, "num_input_tokens_seen": 33298512, "step": 32700 }, { "epoch": 0.6457837018369396, "grad_norm": 2.18471360206604, "learning_rate": 9.954363096138297e-07, "loss": 2.0437, "num_input_tokens_seen": 33397424, "step": 32800 }, { "epoch": 0.6477525545864424, "grad_norm": 1.8377265930175781, "learning_rate": 9.95408474047547e-07, "loss": 1.9945, "num_input_tokens_seen": 33499384, "step": 32900 }, { "epoch": 0.6497214073359453, "grad_norm": 1.9662699699401855, "learning_rate": 9.95380554241232e-07, "loss": 1.966, "num_input_tokens_seen": 33601552, "step": 33000 }, { "epoch": 0.6516902600854482, "grad_norm": 1.7178562879562378, "learning_rate": 9.95352550199632e-07, "loss": 1.9928, "num_input_tokens_seen": 33703952, "step": 33100 }, { "epoch": 0.653659112834951, "grad_norm": 1.94487464427948, "learning_rate": 9.953244619275088e-07, "loss": 2.0221, "num_input_tokens_seen": 33806352, "step": 33200 }, { "epoch": 0.6556279655844539, "grad_norm": 2.074824810028076, "learning_rate": 9.952962894296386e-07, "loss": 1.9933, "num_input_tokens_seen": 33908752, "step": 33300 }, { "epoch": 0.6575968183339568, "grad_norm": 2.118398427963257, "learning_rate": 9.952680327108122e-07, "loss": 1.9703, "num_input_tokens_seen": 34011152, "step": 33400 }, { "epoch": 0.6595656710834596, "grad_norm": 1.9463146924972534, "learning_rate": 9.952396917758336e-07, "loss": 1.9908, "num_input_tokens_seen": 34113048, "step": 33500 }, { "epoch": 0.6615345238329625, "grad_norm": 1.757574439048767, "learning_rate": 9.952112666295227e-07, "loss": 2.0082, "num_input_tokens_seen": 34215152, "step": 33600 }, { "epoch": 0.6635033765824654, "grad_norm": 1.937621831893921, "learning_rate": 9.951827572767128e-07, "loss": 1.975, "num_input_tokens_seen": 34316464, "step": 33700 }, { "epoch": 0.6654722293319683, "grad_norm": 1.856847882270813, "learning_rate": 9.951541637222513e-07, "loss": 2.0031, "num_input_tokens_seen": 34418312, "step": 33800 }, { "epoch": 0.6674410820814711, "grad_norm": 2.0302321910858154, "learning_rate": 9.95125485971001e-07, "loss": 1.9705, "num_input_tokens_seen": 34519792, "step": 33900 }, { "epoch": 0.669409934830974, "grad_norm": 1.937517762184143, "learning_rate": 9.950967240278377e-07, "loss": 1.9463, "num_input_tokens_seen": 34622192, "step": 34000 }, { "epoch": 0.6713787875804769, "grad_norm": 2.2435901165008545, "learning_rate": 9.950678778976523e-07, "loss": 1.9799, "num_input_tokens_seen": 34724592, "step": 34100 }, { "epoch": 0.6733476403299797, "grad_norm": 1.9277533292770386, "learning_rate": 9.950389475853499e-07, "loss": 1.9848, "num_input_tokens_seen": 34826992, "step": 34200 }, { "epoch": 0.6753164930794826, "grad_norm": 2.025343418121338, "learning_rate": 9.950099330958495e-07, "loss": 1.9814, "num_input_tokens_seen": 34929392, "step": 34300 }, { "epoch": 0.6772853458289855, "grad_norm": 2.034280300140381, "learning_rate": 9.949808344340855e-07, "loss": 2.0129, "num_input_tokens_seen": 35031448, "step": 34400 }, { "epoch": 0.6792541985784883, "grad_norm": 1.7865476608276367, "learning_rate": 9.949516516050052e-07, "loss": 1.9997, "num_input_tokens_seen": 35133128, "step": 34500 }, { "epoch": 0.6812230513279912, "grad_norm": 2.143674373626709, "learning_rate": 9.949223846135713e-07, "loss": 1.9308, "num_input_tokens_seen": 35234960, "step": 34600 }, { "epoch": 0.6831919040774941, "grad_norm": 2.0520286560058594, "learning_rate": 9.948930334647602e-07, "loss": 2.0636, "num_input_tokens_seen": 35334576, "step": 34700 }, { "epoch": 0.685160756826997, "grad_norm": 1.8532273769378662, "learning_rate": 9.94863598163563e-07, "loss": 1.9685, "num_input_tokens_seen": 35436176, "step": 34800 }, { "epoch": 0.6871296095764998, "grad_norm": 2.1574697494506836, "learning_rate": 9.948340787149847e-07, "loss": 1.9843, "num_input_tokens_seen": 35538576, "step": 34900 }, { "epoch": 0.6890984623260027, "grad_norm": 1.7824547290802002, "learning_rate": 9.94804475124045e-07, "loss": 1.9959, "num_input_tokens_seen": 35640288, "step": 35000 }, { "epoch": 0.6910673150755055, "grad_norm": 1.9223896265029907, "learning_rate": 9.947747873957775e-07, "loss": 1.988, "num_input_tokens_seen": 35741992, "step": 35100 }, { "epoch": 0.6930361678250083, "grad_norm": 2.1041834354400635, "learning_rate": 9.94745015535231e-07, "loss": 1.9787, "num_input_tokens_seen": 35844392, "step": 35200 }, { "epoch": 0.6950050205745112, "grad_norm": 1.9088863134384155, "learning_rate": 9.947151595474668e-07, "loss": 2.0372, "num_input_tokens_seen": 35946208, "step": 35300 }, { "epoch": 0.6969738733240141, "grad_norm": 1.6192907094955444, "learning_rate": 9.946852194375629e-07, "loss": 1.9878, "num_input_tokens_seen": 36048040, "step": 35400 }, { "epoch": 0.6989427260735169, "grad_norm": 2.0870442390441895, "learning_rate": 9.946551952106094e-07, "loss": 2.0269, "num_input_tokens_seen": 36150440, "step": 35500 }, { "epoch": 0.7009115788230198, "grad_norm": 2.09017014503479, "learning_rate": 9.946250868717122e-07, "loss": 2.0078, "num_input_tokens_seen": 36251920, "step": 35600 }, { "epoch": 0.7028804315725227, "grad_norm": 2.113693952560425, "learning_rate": 9.94594894425991e-07, "loss": 1.9861, "num_input_tokens_seen": 36354320, "step": 35700 }, { "epoch": 0.7048492843220255, "grad_norm": 1.7313716411590576, "learning_rate": 9.945646178785797e-07, "loss": 1.9537, "num_input_tokens_seen": 36456720, "step": 35800 }, { "epoch": 0.7068181370715284, "grad_norm": 2.220529556274414, "learning_rate": 9.94534257234626e-07, "loss": 2.0293, "num_input_tokens_seen": 36557424, "step": 35900 }, { "epoch": 0.7087869898210313, "grad_norm": 2.143380880355835, "learning_rate": 9.945038124992933e-07, "loss": 1.9949, "num_input_tokens_seen": 36658560, "step": 36000 }, { "epoch": 0.7107558425705341, "grad_norm": 1.8441015481948853, "learning_rate": 9.94473283677758e-07, "loss": 1.9858, "num_input_tokens_seen": 36760960, "step": 36100 }, { "epoch": 0.712724695320037, "grad_norm": 2.3321261405944824, "learning_rate": 9.944426707752117e-07, "loss": 2.0326, "num_input_tokens_seen": 36861200, "step": 36200 }, { "epoch": 0.7146935480695399, "grad_norm": 1.9875354766845703, "learning_rate": 9.944119737968592e-07, "loss": 1.9999, "num_input_tokens_seen": 36963600, "step": 36300 }, { "epoch": 0.7166624008190428, "grad_norm": 2.2651355266571045, "learning_rate": 9.943811927479207e-07, "loss": 1.9843, "num_input_tokens_seen": 37066000, "step": 36400 }, { "epoch": 0.7186312535685456, "grad_norm": 2.0598232746124268, "learning_rate": 9.9435032763363e-07, "loss": 1.9995, "num_input_tokens_seen": 37168400, "step": 36500 }, { "epoch": 0.7206001063180485, "grad_norm": 3.320465087890625, "learning_rate": 9.94319378459236e-07, "loss": 1.9962, "num_input_tokens_seen": 37269864, "step": 36600 }, { "epoch": 0.7225689590675514, "grad_norm": 2.06569766998291, "learning_rate": 9.94288345230001e-07, "loss": 1.9829, "num_input_tokens_seen": 37371496, "step": 36700 }, { "epoch": 0.7245378118170542, "grad_norm": 1.7494834661483765, "learning_rate": 9.942572279512014e-07, "loss": 2.007, "num_input_tokens_seen": 37473080, "step": 36800 }, { "epoch": 0.7265066645665571, "grad_norm": 1.762199878692627, "learning_rate": 9.942260266281295e-07, "loss": 2.008, "num_input_tokens_seen": 37574736, "step": 36900 }, { "epoch": 0.72847551731606, "grad_norm": 2.2232725620269775, "learning_rate": 9.9419474126609e-07, "loss": 1.9557, "num_input_tokens_seen": 37676088, "step": 37000 }, { "epoch": 0.7304443700655628, "grad_norm": 1.9961750507354736, "learning_rate": 9.94163371870403e-07, "loss": 2.0327, "num_input_tokens_seen": 37775664, "step": 37100 }, { "epoch": 0.7324132228150657, "grad_norm": 1.8107763528823853, "learning_rate": 9.941319184464026e-07, "loss": 1.9494, "num_input_tokens_seen": 37878064, "step": 37200 }, { "epoch": 0.7343820755645685, "grad_norm": 2.0820484161376953, "learning_rate": 9.94100380999437e-07, "loss": 1.9297, "num_input_tokens_seen": 37980464, "step": 37300 }, { "epoch": 0.7363509283140713, "grad_norm": 1.7529029846191406, "learning_rate": 9.940687595348693e-07, "loss": 2.0081, "num_input_tokens_seen": 38082072, "step": 37400 }, { "epoch": 0.7383197810635742, "grad_norm": 1.797738790512085, "learning_rate": 9.94037054058076e-07, "loss": 1.9588, "num_input_tokens_seen": 38184472, "step": 37500 }, { "epoch": 0.7402886338130771, "grad_norm": 1.732525110244751, "learning_rate": 9.940052645744488e-07, "loss": 1.9768, "num_input_tokens_seen": 38286736, "step": 37600 }, { "epoch": 0.74225748656258, "grad_norm": 1.833274006843567, "learning_rate": 9.939733910893928e-07, "loss": 2.005, "num_input_tokens_seen": 38388616, "step": 37700 }, { "epoch": 0.7442263393120828, "grad_norm": 1.9808897972106934, "learning_rate": 9.93941433608328e-07, "loss": 1.9749, "num_input_tokens_seen": 38490312, "step": 37800 }, { "epoch": 0.7461951920615857, "grad_norm": 1.959140658378601, "learning_rate": 9.939093921366888e-07, "loss": 1.9948, "num_input_tokens_seen": 38591968, "step": 37900 }, { "epoch": 0.7481640448110886, "grad_norm": 1.8730968236923218, "learning_rate": 9.93877266679923e-07, "loss": 1.9451, "num_input_tokens_seen": 38693520, "step": 38000 }, { "epoch": 0.7501328975605914, "grad_norm": 1.9512994289398193, "learning_rate": 9.938450572434936e-07, "loss": 2.0305, "num_input_tokens_seen": 38792232, "step": 38100 }, { "epoch": 0.7521017503100943, "grad_norm": 2.2991583347320557, "learning_rate": 9.938127638328775e-07, "loss": 1.9864, "num_input_tokens_seen": 38893096, "step": 38200 }, { "epoch": 0.7540706030595972, "grad_norm": 1.5798085927963257, "learning_rate": 9.93780386453566e-07, "loss": 1.9588, "num_input_tokens_seen": 38995496, "step": 38300 }, { "epoch": 0.7560394558091, "grad_norm": 2.174484968185425, "learning_rate": 9.937479251110642e-07, "loss": 1.9728, "num_input_tokens_seen": 39097896, "step": 38400 }, { "epoch": 0.7580083085586029, "grad_norm": 6.0786452293396, "learning_rate": 9.937153798108927e-07, "loss": 1.9651, "num_input_tokens_seen": 39200296, "step": 38500 }, { "epoch": 0.7599771613081058, "grad_norm": 2.087667465209961, "learning_rate": 9.936827505585848e-07, "loss": 2.0322, "num_input_tokens_seen": 39302696, "step": 38600 }, { "epoch": 0.7619460140576086, "grad_norm": 1.6887339353561401, "learning_rate": 9.93650037359689e-07, "loss": 1.9569, "num_input_tokens_seen": 39405096, "step": 38700 }, { "epoch": 0.7639148668071115, "grad_norm": 1.8706046342849731, "learning_rate": 9.936172402197682e-07, "loss": 1.9536, "num_input_tokens_seen": 39507496, "step": 38800 }, { "epoch": 0.7658837195566144, "grad_norm": 1.905015230178833, "learning_rate": 9.935843591443988e-07, "loss": 1.9576, "num_input_tokens_seen": 39608984, "step": 38900 }, { "epoch": 0.7678525723061173, "grad_norm": 1.635048508644104, "learning_rate": 9.935513941391724e-07, "loss": 1.983, "num_input_tokens_seen": 39711384, "step": 39000 }, { "epoch": 0.7698214250556201, "grad_norm": 2.011270761489868, "learning_rate": 9.935183452096943e-07, "loss": 1.9447, "num_input_tokens_seen": 39813312, "step": 39100 }, { "epoch": 0.771790277805123, "grad_norm": 2.384460210800171, "learning_rate": 9.934852123615838e-07, "loss": 1.9493, "num_input_tokens_seen": 39915712, "step": 39200 }, { "epoch": 0.7737591305546259, "grad_norm": 2.0290377140045166, "learning_rate": 9.934519956004753e-07, "loss": 2.0189, "num_input_tokens_seen": 40017192, "step": 39300 }, { "epoch": 0.7757279833041287, "grad_norm": 2.2202324867248535, "learning_rate": 9.934186949320172e-07, "loss": 1.9848, "num_input_tokens_seen": 40118776, "step": 39400 }, { "epoch": 0.7776968360536316, "grad_norm": 2.19101881980896, "learning_rate": 9.933853103618715e-07, "loss": 2.0028, "num_input_tokens_seen": 40220000, "step": 39500 }, { "epoch": 0.7796656888031344, "grad_norm": 2.1909310817718506, "learning_rate": 9.93351841895715e-07, "loss": 1.9879, "num_input_tokens_seen": 40322400, "step": 39600 }, { "epoch": 0.7816345415526372, "grad_norm": 1.8135439157485962, "learning_rate": 9.933182895392392e-07, "loss": 1.9514, "num_input_tokens_seen": 40424096, "step": 39700 }, { "epoch": 0.7836033943021401, "grad_norm": 2.112919569015503, "learning_rate": 9.93284653298149e-07, "loss": 2.0307, "num_input_tokens_seen": 40526264, "step": 39800 }, { "epoch": 0.785572247051643, "grad_norm": 1.9405063390731812, "learning_rate": 9.932509331781641e-07, "loss": 1.9797, "num_input_tokens_seen": 40628664, "step": 39900 }, { "epoch": 0.7875410998011458, "grad_norm": 1.945940375328064, "learning_rate": 9.93217129185018e-07, "loss": 2.0123, "num_input_tokens_seen": 40731064, "step": 40000 }, { "epoch": 0.7895099525506487, "grad_norm": 1.8466817140579224, "learning_rate": 9.931832413244595e-07, "loss": 1.9777, "num_input_tokens_seen": 40831424, "step": 40100 }, { "epoch": 0.7914788053001516, "grad_norm": 2.042346715927124, "learning_rate": 9.931492696022503e-07, "loss": 1.9964, "num_input_tokens_seen": 40933824, "step": 40200 }, { "epoch": 0.7934476580496544, "grad_norm": 1.955869436264038, "learning_rate": 9.931152140241673e-07, "loss": 1.9658, "num_input_tokens_seen": 41035632, "step": 40300 }, { "epoch": 0.7954165107991573, "grad_norm": 1.9215490818023682, "learning_rate": 9.930810745960012e-07, "loss": 1.9834, "num_input_tokens_seen": 41138032, "step": 40400 }, { "epoch": 0.7973853635486602, "grad_norm": 1.7350525856018066, "learning_rate": 9.930468513235573e-07, "loss": 2.0175, "num_input_tokens_seen": 41239848, "step": 40500 }, { "epoch": 0.7993542162981631, "grad_norm": 2.2450482845306396, "learning_rate": 9.930125442126546e-07, "loss": 1.9772, "num_input_tokens_seen": 41341712, "step": 40600 }, { "epoch": 0.8013230690476659, "grad_norm": 1.9311119318008423, "learning_rate": 9.929781532691275e-07, "loss": 1.9549, "num_input_tokens_seen": 41442864, "step": 40700 }, { "epoch": 0.8032919217971688, "grad_norm": 1.9055299758911133, "learning_rate": 9.929436784988229e-07, "loss": 2.0071, "num_input_tokens_seen": 41544624, "step": 40800 }, { "epoch": 0.8052607745466717, "grad_norm": 1.8988803625106812, "learning_rate": 9.929091199076038e-07, "loss": 2.0175, "num_input_tokens_seen": 41645312, "step": 40900 }, { "epoch": 0.8072296272961745, "grad_norm": 2.0395917892456055, "learning_rate": 9.928744775013462e-07, "loss": 1.9737, "num_input_tokens_seen": 41745664, "step": 41000 }, { "epoch": 0.8091984800456774, "grad_norm": 1.9754116535186768, "learning_rate": 9.928397512859405e-07, "loss": 1.9739, "num_input_tokens_seen": 41847232, "step": 41100 }, { "epoch": 0.8111673327951803, "grad_norm": 2.7534637451171875, "learning_rate": 9.928049412672923e-07, "loss": 1.969, "num_input_tokens_seen": 41949632, "step": 41200 }, { "epoch": 0.8131361855446831, "grad_norm": 1.875912070274353, "learning_rate": 9.9277004745132e-07, "loss": 1.9624, "num_input_tokens_seen": 42052032, "step": 41300 }, { "epoch": 0.815105038294186, "grad_norm": 1.896838903427124, "learning_rate": 9.927350698439577e-07, "loss": 1.9597, "num_input_tokens_seen": 42154432, "step": 41400 }, { "epoch": 0.8170738910436889, "grad_norm": 1.8952997922897339, "learning_rate": 9.927000084511525e-07, "loss": 2.0168, "num_input_tokens_seen": 42255664, "step": 41500 }, { "epoch": 0.8190427437931918, "grad_norm": 1.8672600984573364, "learning_rate": 9.926648632788663e-07, "loss": 1.9418, "num_input_tokens_seen": 42357248, "step": 41600 }, { "epoch": 0.8210115965426946, "grad_norm": 2.009817123413086, "learning_rate": 9.926296343330758e-07, "loss": 1.9218, "num_input_tokens_seen": 42459648, "step": 41700 }, { "epoch": 0.8229804492921975, "grad_norm": 1.8217511177062988, "learning_rate": 9.925943216197707e-07, "loss": 2.0096, "num_input_tokens_seen": 42561272, "step": 41800 }, { "epoch": 0.8249493020417002, "grad_norm": 1.804321050643921, "learning_rate": 9.925589251449561e-07, "loss": 2.0198, "num_input_tokens_seen": 42663672, "step": 41900 }, { "epoch": 0.8269181547912031, "grad_norm": 1.753313422203064, "learning_rate": 9.925234449146507e-07, "loss": 1.9451, "num_input_tokens_seen": 42764968, "step": 42000 }, { "epoch": 0.828887007540706, "grad_norm": 1.8761632442474365, "learning_rate": 9.924878809348875e-07, "loss": 2.0011, "num_input_tokens_seen": 42866600, "step": 42100 }, { "epoch": 0.8308558602902089, "grad_norm": 1.9971232414245605, "learning_rate": 9.924522332117143e-07, "loss": 1.9953, "num_input_tokens_seen": 42968352, "step": 42200 }, { "epoch": 0.8328247130397117, "grad_norm": 1.9807604551315308, "learning_rate": 9.92416501751192e-07, "loss": 1.964, "num_input_tokens_seen": 43070752, "step": 42300 }, { "epoch": 0.8347935657892146, "grad_norm": 1.793778419494629, "learning_rate": 9.92380686559397e-07, "loss": 1.9883, "num_input_tokens_seen": 43173152, "step": 42400 }, { "epoch": 0.8367624185387175, "grad_norm": 1.7874850034713745, "learning_rate": 9.923447876424192e-07, "loss": 1.928, "num_input_tokens_seen": 43274384, "step": 42500 }, { "epoch": 0.8387312712882203, "grad_norm": 1.9487388134002686, "learning_rate": 9.923088050063628e-07, "loss": 1.9831, "num_input_tokens_seen": 43376784, "step": 42600 }, { "epoch": 0.8407001240377232, "grad_norm": 3.431633234024048, "learning_rate": 9.922727386573465e-07, "loss": 1.9724, "num_input_tokens_seen": 43479184, "step": 42700 }, { "epoch": 0.8426689767872261, "grad_norm": 1.9606448411941528, "learning_rate": 9.92236588601503e-07, "loss": 1.9748, "num_input_tokens_seen": 43580352, "step": 42800 }, { "epoch": 0.844637829536729, "grad_norm": 2.18243145942688, "learning_rate": 9.922003548449793e-07, "loss": 2.0376, "num_input_tokens_seen": 43680896, "step": 42900 }, { "epoch": 0.8466066822862318, "grad_norm": 1.7839654684066772, "learning_rate": 9.921640373939367e-07, "loss": 1.9521, "num_input_tokens_seen": 43783296, "step": 43000 }, { "epoch": 0.8485755350357347, "grad_norm": 1.9653984308242798, "learning_rate": 9.921276362545506e-07, "loss": 1.9842, "num_input_tokens_seen": 43884520, "step": 43100 }, { "epoch": 0.8505443877852376, "grad_norm": 6.457769870758057, "learning_rate": 9.920911514330108e-07, "loss": 1.9927, "num_input_tokens_seen": 43986920, "step": 43200 }, { "epoch": 0.8525132405347404, "grad_norm": 2.368502616882324, "learning_rate": 9.920545829355212e-07, "loss": 1.9525, "num_input_tokens_seen": 44089320, "step": 43300 }, { "epoch": 0.8544820932842433, "grad_norm": 1.9824861288070679, "learning_rate": 9.920179307682998e-07, "loss": 2.0064, "num_input_tokens_seen": 44190960, "step": 43400 }, { "epoch": 0.8564509460337462, "grad_norm": 1.8569347858428955, "learning_rate": 9.919811949375795e-07, "loss": 2.0172, "num_input_tokens_seen": 44293360, "step": 43500 }, { "epoch": 0.858419798783249, "grad_norm": 1.8042532205581665, "learning_rate": 9.919443754496063e-07, "loss": 2.0037, "num_input_tokens_seen": 44395216, "step": 43600 }, { "epoch": 0.8603886515327519, "grad_norm": 2.066889762878418, "learning_rate": 9.91907472310641e-07, "loss": 1.9806, "num_input_tokens_seen": 44496800, "step": 43700 }, { "epoch": 0.8623575042822548, "grad_norm": 2.8663549423217773, "learning_rate": 9.918704855269595e-07, "loss": 2.0107, "num_input_tokens_seen": 44598504, "step": 43800 }, { "epoch": 0.8643263570317576, "grad_norm": 1.7114691734313965, "learning_rate": 9.918334151048504e-07, "loss": 1.9491, "num_input_tokens_seen": 44700320, "step": 43900 }, { "epoch": 0.8662952097812605, "grad_norm": 1.7910566329956055, "learning_rate": 9.917962610506173e-07, "loss": 1.9544, "num_input_tokens_seen": 44802720, "step": 44000 }, { "epoch": 0.8682640625307633, "grad_norm": 2.052602767944336, "learning_rate": 9.917590233705779e-07, "loss": 2.0316, "num_input_tokens_seen": 44904528, "step": 44100 }, { "epoch": 0.8702329152802661, "grad_norm": 1.9590590000152588, "learning_rate": 9.917217020710645e-07, "loss": 1.9944, "num_input_tokens_seen": 45005728, "step": 44200 }, { "epoch": 0.872201768029769, "grad_norm": 1.757790207862854, "learning_rate": 9.91684297158423e-07, "loss": 2.0238, "num_input_tokens_seen": 45108128, "step": 44300 }, { "epoch": 0.8741706207792719, "grad_norm": 1.8873578310012817, "learning_rate": 9.916468086390137e-07, "loss": 2.0045, "num_input_tokens_seen": 45209224, "step": 44400 }, { "epoch": 0.8761394735287747, "grad_norm": 1.9909436702728271, "learning_rate": 9.916092365192115e-07, "loss": 1.9804, "num_input_tokens_seen": 45311072, "step": 44500 }, { "epoch": 0.8781083262782776, "grad_norm": 2.1602702140808105, "learning_rate": 9.915715808054049e-07, "loss": 2.0682, "num_input_tokens_seen": 45410968, "step": 44600 }, { "epoch": 0.8800771790277805, "grad_norm": 1.7149327993392944, "learning_rate": 9.91533841503997e-07, "loss": 1.9653, "num_input_tokens_seen": 45513368, "step": 44700 }, { "epoch": 0.8820460317772834, "grad_norm": 2.3716652393341064, "learning_rate": 9.914960186214055e-07, "loss": 1.9645, "num_input_tokens_seen": 45615768, "step": 44800 }, { "epoch": 0.8840148845267862, "grad_norm": 2.006587266921997, "learning_rate": 9.914581121640612e-07, "loss": 1.9974, "num_input_tokens_seen": 45717544, "step": 44900 }, { "epoch": 0.8859837372762891, "grad_norm": 1.9556728601455688, "learning_rate": 9.914201221384101e-07, "loss": 1.9571, "num_input_tokens_seen": 45819128, "step": 45000 }, { "epoch": 0.887952590025792, "grad_norm": 2.1543567180633545, "learning_rate": 9.913820485509122e-07, "loss": 1.9756, "num_input_tokens_seen": 45920952, "step": 45100 }, { "epoch": 0.8899214427752948, "grad_norm": 1.9320228099822998, "learning_rate": 9.913438914080414e-07, "loss": 1.9643, "num_input_tokens_seen": 46023352, "step": 45200 }, { "epoch": 0.8918902955247977, "grad_norm": 1.7192232608795166, "learning_rate": 9.91305650716286e-07, "loss": 2.0042, "num_input_tokens_seen": 46124944, "step": 45300 }, { "epoch": 0.8938591482743006, "grad_norm": 1.710333228111267, "learning_rate": 9.912673264821485e-07, "loss": 2.0368, "num_input_tokens_seen": 46226720, "step": 45400 }, { "epoch": 0.8958280010238034, "grad_norm": 1.861194133758545, "learning_rate": 9.91228918712146e-07, "loss": 1.9908, "num_input_tokens_seen": 46329120, "step": 45500 }, { "epoch": 0.8977968537733063, "grad_norm": 2.087844133377075, "learning_rate": 9.911904274128087e-07, "loss": 2.0278, "num_input_tokens_seen": 46430336, "step": 45600 }, { "epoch": 0.8997657065228092, "grad_norm": 1.9030890464782715, "learning_rate": 9.911518525906822e-07, "loss": 1.9771, "num_input_tokens_seen": 46532080, "step": 45700 }, { "epoch": 0.901734559272312, "grad_norm": 1.9622066020965576, "learning_rate": 9.911131942523257e-07, "loss": 1.976, "num_input_tokens_seen": 46632704, "step": 45800 }, { "epoch": 0.9037034120218149, "grad_norm": 1.8562543392181396, "learning_rate": 9.91074452404313e-07, "loss": 2.0071, "num_input_tokens_seen": 46734312, "step": 45900 }, { "epoch": 0.9056722647713178, "grad_norm": 1.9798778295516968, "learning_rate": 9.910356270532314e-07, "loss": 1.9931, "num_input_tokens_seen": 46835240, "step": 46000 }, { "epoch": 0.9076411175208207, "grad_norm": 2.1422736644744873, "learning_rate": 9.90996718205683e-07, "loss": 1.9681, "num_input_tokens_seen": 46937640, "step": 46100 }, { "epoch": 0.9096099702703235, "grad_norm": 2.19767427444458, "learning_rate": 9.90957725868284e-07, "loss": 1.9566, "num_input_tokens_seen": 47040040, "step": 46200 }, { "epoch": 0.9115788230198264, "grad_norm": 2.0844364166259766, "learning_rate": 9.909186500476645e-07, "loss": 1.9897, "num_input_tokens_seen": 47141048, "step": 46300 }, { "epoch": 0.9135476757693292, "grad_norm": 1.9962643384933472, "learning_rate": 9.908794907504693e-07, "loss": 1.9685, "num_input_tokens_seen": 47243448, "step": 46400 }, { "epoch": 0.915516528518832, "grad_norm": 1.8777378797531128, "learning_rate": 9.908402479833569e-07, "loss": 1.9752, "num_input_tokens_seen": 47345848, "step": 46500 }, { "epoch": 0.9174853812683349, "grad_norm": 2.2894251346588135, "learning_rate": 9.908009217530004e-07, "loss": 1.9997, "num_input_tokens_seen": 47447496, "step": 46600 }, { "epoch": 0.9194542340178378, "grad_norm": 1.9015779495239258, "learning_rate": 9.907615120660866e-07, "loss": 1.9873, "num_input_tokens_seen": 47549320, "step": 46700 }, { "epoch": 0.9214230867673406, "grad_norm": 1.8924967050552368, "learning_rate": 9.90722018929317e-07, "loss": 2.0095, "num_input_tokens_seen": 47649912, "step": 46800 }, { "epoch": 0.9233919395168435, "grad_norm": 2.0390703678131104, "learning_rate": 9.90682442349407e-07, "loss": 1.9489, "num_input_tokens_seen": 47752312, "step": 46900 }, { "epoch": 0.9253607922663464, "grad_norm": 1.8609728813171387, "learning_rate": 9.906427823330862e-07, "loss": 2.0041, "num_input_tokens_seen": 47854272, "step": 47000 }, { "epoch": 0.9273296450158492, "grad_norm": 1.909618616104126, "learning_rate": 9.906030388870988e-07, "loss": 1.9714, "num_input_tokens_seen": 47955944, "step": 47100 }, { "epoch": 0.9292984977653521, "grad_norm": 1.7780113220214844, "learning_rate": 9.905632120182024e-07, "loss": 1.9818, "num_input_tokens_seen": 48057888, "step": 47200 }, { "epoch": 0.931267350514855, "grad_norm": 1.7854924201965332, "learning_rate": 9.905233017331695e-07, "loss": 1.9912, "num_input_tokens_seen": 48158632, "step": 47300 }, { "epoch": 0.9332362032643579, "grad_norm": 3.2184438705444336, "learning_rate": 9.904833080387862e-07, "loss": 2.0233, "num_input_tokens_seen": 48261032, "step": 47400 }, { "epoch": 0.9352050560138607, "grad_norm": 1.8930283784866333, "learning_rate": 9.904432309418534e-07, "loss": 1.99, "num_input_tokens_seen": 48363432, "step": 47500 }, { "epoch": 0.9371739087633636, "grad_norm": 2.1618216037750244, "learning_rate": 9.90403070449186e-07, "loss": 1.9754, "num_input_tokens_seen": 48465832, "step": 47600 }, { "epoch": 0.9391427615128665, "grad_norm": 1.9905320405960083, "learning_rate": 9.903628265676124e-07, "loss": 1.9668, "num_input_tokens_seen": 48566944, "step": 47700 }, { "epoch": 0.9411116142623693, "grad_norm": 2.0114645957946777, "learning_rate": 9.90322499303976e-07, "loss": 1.9741, "num_input_tokens_seen": 48668720, "step": 47800 }, { "epoch": 0.9430804670118722, "grad_norm": 1.9229475259780884, "learning_rate": 9.902820886651344e-07, "loss": 1.9926, "num_input_tokens_seen": 48770432, "step": 47900 }, { "epoch": 0.9450493197613751, "grad_norm": 2.2934682369232178, "learning_rate": 9.902415946579589e-07, "loss": 1.9601, "num_input_tokens_seen": 48872832, "step": 48000 }, { "epoch": 0.9470181725108779, "grad_norm": 1.8845819234848022, "learning_rate": 9.902010172893349e-07, "loss": 1.9797, "num_input_tokens_seen": 48975232, "step": 48100 }, { "epoch": 0.9489870252603808, "grad_norm": 1.9820157289505005, "learning_rate": 9.901603565661626e-07, "loss": 2.003, "num_input_tokens_seen": 49077320, "step": 48200 }, { "epoch": 0.9509558780098837, "grad_norm": 1.7189642190933228, "learning_rate": 9.901196124953557e-07, "loss": 1.9997, "num_input_tokens_seen": 49178888, "step": 48300 }, { "epoch": 0.9529247307593866, "grad_norm": 1.933113694190979, "learning_rate": 9.900787850838428e-07, "loss": 1.9996, "num_input_tokens_seen": 49281288, "step": 48400 }, { "epoch": 0.9548935835088894, "grad_norm": 1.7269684076309204, "learning_rate": 9.900378743385659e-07, "loss": 2.0065, "num_input_tokens_seen": 49383168, "step": 48500 }, { "epoch": 0.9568624362583922, "grad_norm": 1.9899290800094604, "learning_rate": 9.899968802664816e-07, "loss": 1.9805, "num_input_tokens_seen": 49484008, "step": 48600 }, { "epoch": 0.958831289007895, "grad_norm": 1.9168274402618408, "learning_rate": 9.899558028745607e-07, "loss": 1.9843, "num_input_tokens_seen": 49585552, "step": 48700 }, { "epoch": 0.9608001417573979, "grad_norm": 1.8895981311798096, "learning_rate": 9.89914642169788e-07, "loss": 2.0373, "num_input_tokens_seen": 49687952, "step": 48800 }, { "epoch": 0.9627689945069008, "grad_norm": 1.8340901136398315, "learning_rate": 9.898733981591625e-07, "loss": 1.9435, "num_input_tokens_seen": 49790352, "step": 48900 }, { "epoch": 0.9647378472564037, "grad_norm": 1.5985980033874512, "learning_rate": 9.898320708496974e-07, "loss": 1.9343, "num_input_tokens_seen": 49891936, "step": 49000 }, { "epoch": 0.9667067000059065, "grad_norm": 1.835910677909851, "learning_rate": 9.8979066024842e-07, "loss": 2.025, "num_input_tokens_seen": 49994336, "step": 49100 }, { "epoch": 0.9686755527554094, "grad_norm": 1.802293062210083, "learning_rate": 9.89749166362372e-07, "loss": 1.9972, "num_input_tokens_seen": 50096256, "step": 49200 }, { "epoch": 0.9706444055049123, "grad_norm": 3.131337881088257, "learning_rate": 9.89707589198609e-07, "loss": 1.9958, "num_input_tokens_seen": 50198656, "step": 49300 }, { "epoch": 0.9726132582544151, "grad_norm": 1.742756962776184, "learning_rate": 9.896659287642008e-07, "loss": 1.9598, "num_input_tokens_seen": 50301056, "step": 49400 }, { "epoch": 0.974582111003918, "grad_norm": 2.2476494312286377, "learning_rate": 9.896241850662316e-07, "loss": 1.9685, "num_input_tokens_seen": 50402888, "step": 49500 }, { "epoch": 0.9765509637534209, "grad_norm": 1.8031234741210938, "learning_rate": 9.895823581117992e-07, "loss": 1.9915, "num_input_tokens_seen": 50505288, "step": 49600 }, { "epoch": 0.9785198165029237, "grad_norm": 1.8810124397277832, "learning_rate": 9.895404479080162e-07, "loss": 2.0079, "num_input_tokens_seen": 50607688, "step": 49700 }, { "epoch": 0.9804886692524266, "grad_norm": 2.075939416885376, "learning_rate": 9.894984544620092e-07, "loss": 2.0239, "num_input_tokens_seen": 50709168, "step": 49800 }, { "epoch": 0.9824575220019295, "grad_norm": 1.7710177898406982, "learning_rate": 9.894563777809185e-07, "loss": 2.003, "num_input_tokens_seen": 50809968, "step": 49900 }, { "epoch": 0.9844263747514324, "grad_norm": 2.261514186859131, "learning_rate": 9.894142178718989e-07, "loss": 1.9824, "num_input_tokens_seen": 50911664, "step": 50000 }, { "epoch": 0.9863952275009352, "grad_norm": 2.0110349655151367, "learning_rate": 9.893719747421196e-07, "loss": 1.9648, "num_input_tokens_seen": 51013200, "step": 50100 }, { "epoch": 0.9883640802504381, "grad_norm": 1.8570513725280762, "learning_rate": 9.893296483987636e-07, "loss": 1.9823, "num_input_tokens_seen": 51115104, "step": 50200 }, { "epoch": 0.990332932999941, "grad_norm": 1.7586266994476318, "learning_rate": 9.89287238849028e-07, "loss": 1.9897, "num_input_tokens_seen": 51217504, "step": 50300 }, { "epoch": 0.9923017857494438, "grad_norm": 2.0321013927459717, "learning_rate": 9.892447461001243e-07, "loss": 2.0414, "num_input_tokens_seen": 51318288, "step": 50400 }, { "epoch": 0.9942706384989467, "grad_norm": 2.01430606842041, "learning_rate": 9.892021701592778e-07, "loss": 1.9811, "num_input_tokens_seen": 51420136, "step": 50500 }, { "epoch": 0.9962394912484496, "grad_norm": 1.980430245399475, "learning_rate": 9.891595110337288e-07, "loss": 1.9351, "num_input_tokens_seen": 51522536, "step": 50600 }, { "epoch": 0.9982083439979524, "grad_norm": 7.373106956481934, "learning_rate": 9.891167687307303e-07, "loss": 2.0245, "num_input_tokens_seen": 51622928, "step": 50700 }, { "epoch": 1.0001771967474553, "grad_norm": 1.8478271961212158, "learning_rate": 9.89073943257551e-07, "loss": 1.9652, "num_input_tokens_seen": 51725328, "step": 50800 }, { "epoch": 1.002146049496958, "grad_norm": 1.8001004457473755, "learning_rate": 9.890310346214726e-07, "loss": 1.9659, "num_input_tokens_seen": 51827728, "step": 50900 }, { "epoch": 1.004114902246461, "grad_norm": 2.1868462562561035, "learning_rate": 9.889880428297912e-07, "loss": 1.9379, "num_input_tokens_seen": 51930128, "step": 51000 }, { "epoch": 1.0060837549959638, "grad_norm": 2.034771680831909, "learning_rate": 9.889449678898176e-07, "loss": 1.9795, "num_input_tokens_seen": 52032528, "step": 51100 }, { "epoch": 1.0080526077454668, "grad_norm": 2.0096757411956787, "learning_rate": 9.889018098088763e-07, "loss": 1.9782, "num_input_tokens_seen": 52133120, "step": 51200 }, { "epoch": 1.0100214604949695, "grad_norm": 2.168287754058838, "learning_rate": 9.888585685943057e-07, "loss": 1.9832, "num_input_tokens_seen": 52235000, "step": 51300 }, { "epoch": 1.0119903132444725, "grad_norm": 1.8824621438980103, "learning_rate": 9.888152442534587e-07, "loss": 1.9645, "num_input_tokens_seen": 52337400, "step": 51400 }, { "epoch": 1.0139591659939753, "grad_norm": 1.8408710956573486, "learning_rate": 9.887718367937023e-07, "loss": 2.0113, "num_input_tokens_seen": 52438976, "step": 51500 }, { "epoch": 1.0159280187434783, "grad_norm": 2.1709954738616943, "learning_rate": 9.887283462224176e-07, "loss": 1.9937, "num_input_tokens_seen": 52541376, "step": 51600 }, { "epoch": 1.017896871492981, "grad_norm": 2.017909288406372, "learning_rate": 9.886847725469999e-07, "loss": 1.9855, "num_input_tokens_seen": 52643776, "step": 51700 }, { "epoch": 1.019865724242484, "grad_norm": 2.0906968116760254, "learning_rate": 9.88641115774858e-07, "loss": 1.974, "num_input_tokens_seen": 52746176, "step": 51800 }, { "epoch": 1.0218345769919868, "grad_norm": 2.0039424896240234, "learning_rate": 9.88597375913416e-07, "loss": 1.9614, "num_input_tokens_seen": 52847936, "step": 51900 }, { "epoch": 1.0238034297414895, "grad_norm": 1.7447197437286377, "learning_rate": 9.885535529701113e-07, "loss": 1.996, "num_input_tokens_seen": 52950248, "step": 52000 }, { "epoch": 1.0257722824909925, "grad_norm": 1.8214547634124756, "learning_rate": 9.885096469523954e-07, "loss": 1.9267, "num_input_tokens_seen": 53052648, "step": 52100 }, { "epoch": 1.0277411352404953, "grad_norm": 2.217625617980957, "learning_rate": 9.884656578677344e-07, "loss": 1.958, "num_input_tokens_seen": 53155048, "step": 52200 }, { "epoch": 1.0297099879899982, "grad_norm": 1.7927933931350708, "learning_rate": 9.884215857236082e-07, "loss": 1.9878, "num_input_tokens_seen": 53256672, "step": 52300 }, { "epoch": 1.031678840739501, "grad_norm": 1.8615748882293701, "learning_rate": 9.883774305275107e-07, "loss": 2.0078, "num_input_tokens_seen": 53358264, "step": 52400 }, { "epoch": 1.033647693489004, "grad_norm": 2.009561777114868, "learning_rate": 9.883331922869505e-07, "loss": 1.9489, "num_input_tokens_seen": 53460664, "step": 52500 }, { "epoch": 1.0356165462385067, "grad_norm": 1.8303207159042358, "learning_rate": 9.882888710094498e-07, "loss": 2.0028, "num_input_tokens_seen": 53562144, "step": 52600 }, { "epoch": 1.0375853989880097, "grad_norm": 1.85591721534729, "learning_rate": 9.882444667025449e-07, "loss": 2.0363, "num_input_tokens_seen": 53664544, "step": 52700 }, { "epoch": 1.0395542517375125, "grad_norm": 1.7349711656570435, "learning_rate": 9.881999793737865e-07, "loss": 1.9919, "num_input_tokens_seen": 53766376, "step": 52800 }, { "epoch": 1.0415231044870155, "grad_norm": 1.818564772605896, "learning_rate": 9.881554090307393e-07, "loss": 1.9611, "num_input_tokens_seen": 53868776, "step": 52900 }, { "epoch": 1.0434919572365182, "grad_norm": 1.9757059812545776, "learning_rate": 9.88110755680982e-07, "loss": 1.9805, "num_input_tokens_seen": 53971176, "step": 53000 }, { "epoch": 1.0454608099860212, "grad_norm": 1.6863590478897095, "learning_rate": 9.880660193321078e-07, "loss": 1.9512, "num_input_tokens_seen": 54073424, "step": 53100 }, { "epoch": 1.047429662735524, "grad_norm": 1.8998291492462158, "learning_rate": 9.880211999917234e-07, "loss": 1.9927, "num_input_tokens_seen": 54175104, "step": 53200 }, { "epoch": 1.049398515485027, "grad_norm": 2.458225727081299, "learning_rate": 9.8797629766745e-07, "loss": 1.9625, "num_input_tokens_seen": 54276800, "step": 53300 }, { "epoch": 1.0513673682345297, "grad_norm": 1.8298643827438354, "learning_rate": 9.87931312366923e-07, "loss": 1.9786, "num_input_tokens_seen": 54378432, "step": 53400 }, { "epoch": 1.0533362209840327, "grad_norm": 2.184784412384033, "learning_rate": 9.87886244097792e-07, "loss": 1.9977, "num_input_tokens_seen": 54480152, "step": 53500 }, { "epoch": 1.0553050737335354, "grad_norm": 1.840516209602356, "learning_rate": 9.8784109286772e-07, "loss": 1.9702, "num_input_tokens_seen": 54582256, "step": 53600 }, { "epoch": 1.0572739264830384, "grad_norm": 1.9477344751358032, "learning_rate": 9.87795858684385e-07, "loss": 1.9578, "num_input_tokens_seen": 54683912, "step": 53700 }, { "epoch": 1.0592427792325412, "grad_norm": 1.9940179586410522, "learning_rate": 9.87750541555478e-07, "loss": 1.9477, "num_input_tokens_seen": 54786312, "step": 53800 }, { "epoch": 1.0612116319820442, "grad_norm": 1.9116097688674927, "learning_rate": 9.877051414887058e-07, "loss": 2.022, "num_input_tokens_seen": 54888712, "step": 53900 }, { "epoch": 1.063180484731547, "grad_norm": 2.0709564685821533, "learning_rate": 9.876596584917876e-07, "loss": 1.9789, "num_input_tokens_seen": 54991112, "step": 54000 }, { "epoch": 1.06514933748105, "grad_norm": 1.8915525674819946, "learning_rate": 9.876140925724574e-07, "loss": 2.003, "num_input_tokens_seen": 55092752, "step": 54100 }, { "epoch": 1.0671181902305527, "grad_norm": 1.904774785041809, "learning_rate": 9.875684437384637e-07, "loss": 1.966, "num_input_tokens_seen": 55194512, "step": 54200 }, { "epoch": 1.0690870429800556, "grad_norm": 1.8894705772399902, "learning_rate": 9.875227119975685e-07, "loss": 2.0077, "num_input_tokens_seen": 55296912, "step": 54300 }, { "epoch": 1.0710558957295584, "grad_norm": 1.9152696132659912, "learning_rate": 9.87476897357548e-07, "loss": 1.9686, "num_input_tokens_seen": 55398832, "step": 54400 }, { "epoch": 1.0730247484790612, "grad_norm": 2.045538902282715, "learning_rate": 9.874309998261927e-07, "loss": 2.0322, "num_input_tokens_seen": 55501232, "step": 54500 }, { "epoch": 1.0749936012285641, "grad_norm": 2.1518824100494385, "learning_rate": 9.873850194113072e-07, "loss": 2.0826, "num_input_tokens_seen": 55602496, "step": 54600 }, { "epoch": 1.076962453978067, "grad_norm": 2.073474168777466, "learning_rate": 9.873389561207097e-07, "loss": 1.9722, "num_input_tokens_seen": 55703048, "step": 54700 }, { "epoch": 1.0789313067275699, "grad_norm": 1.8059463500976562, "learning_rate": 9.872928099622334e-07, "loss": 1.9814, "num_input_tokens_seen": 55804592, "step": 54800 }, { "epoch": 1.0809001594770726, "grad_norm": 2.14548397064209, "learning_rate": 9.872465809437245e-07, "loss": 1.9392, "num_input_tokens_seen": 55906248, "step": 54900 }, { "epoch": 1.0828690122265756, "grad_norm": 1.8822154998779297, "learning_rate": 9.872002690730444e-07, "loss": 2.0209, "num_input_tokens_seen": 56007496, "step": 55000 }, { "epoch": 1.0848378649760784, "grad_norm": 1.85554039478302, "learning_rate": 9.871538743580675e-07, "loss": 1.986, "num_input_tokens_seen": 56109896, "step": 55100 }, { "epoch": 1.0868067177255814, "grad_norm": 1.8365095853805542, "learning_rate": 9.871073968066832e-07, "loss": 1.9862, "num_input_tokens_seen": 56212296, "step": 55200 }, { "epoch": 1.088775570475084, "grad_norm": 2.052393913269043, "learning_rate": 9.870608364267946e-07, "loss": 1.9599, "num_input_tokens_seen": 56314696, "step": 55300 }, { "epoch": 1.090744423224587, "grad_norm": 1.9801706075668335, "learning_rate": 9.87014193226319e-07, "loss": 2.0252, "num_input_tokens_seen": 56417096, "step": 55400 }, { "epoch": 1.0927132759740898, "grad_norm": 1.9569658041000366, "learning_rate": 9.86967467213187e-07, "loss": 1.9421, "num_input_tokens_seen": 56519496, "step": 55500 }, { "epoch": 1.0946821287235928, "grad_norm": 1.944030523300171, "learning_rate": 9.869206583953448e-07, "loss": 1.9723, "num_input_tokens_seen": 56621896, "step": 55600 }, { "epoch": 1.0966509814730956, "grad_norm": 2.0947165489196777, "learning_rate": 9.868737667807517e-07, "loss": 2.0142, "num_input_tokens_seen": 56724296, "step": 55700 }, { "epoch": 1.0986198342225986, "grad_norm": 1.771150827407837, "learning_rate": 9.868267923773807e-07, "loss": 1.9717, "num_input_tokens_seen": 56826384, "step": 55800 }, { "epoch": 1.1005886869721013, "grad_norm": 1.905934453010559, "learning_rate": 9.8677973519322e-07, "loss": 1.9431, "num_input_tokens_seen": 56928784, "step": 55900 }, { "epoch": 1.1025575397216043, "grad_norm": 2.3866124153137207, "learning_rate": 9.86732595236271e-07, "loss": 2.0366, "num_input_tokens_seen": 57031184, "step": 56000 }, { "epoch": 1.104526392471107, "grad_norm": 1.8602744340896606, "learning_rate": 9.866853725145491e-07, "loss": 2.0002, "num_input_tokens_seen": 57133096, "step": 56100 }, { "epoch": 1.10649524522061, "grad_norm": 2.094759941101074, "learning_rate": 9.866380670360848e-07, "loss": 1.9963, "num_input_tokens_seen": 57233904, "step": 56200 }, { "epoch": 1.1084640979701128, "grad_norm": 1.7452400922775269, "learning_rate": 9.865906788089218e-07, "loss": 2.0254, "num_input_tokens_seen": 57336304, "step": 56300 }, { "epoch": 1.1104329507196158, "grad_norm": 3.6265110969543457, "learning_rate": 9.865432078411178e-07, "loss": 1.9778, "num_input_tokens_seen": 57438096, "step": 56400 }, { "epoch": 1.1124018034691185, "grad_norm": 2.027275562286377, "learning_rate": 9.86495654140745e-07, "loss": 2.0114, "num_input_tokens_seen": 57539928, "step": 56500 }, { "epoch": 1.1143706562186213, "grad_norm": 1.9589576721191406, "learning_rate": 9.864480177158896e-07, "loss": 1.9805, "num_input_tokens_seen": 57641136, "step": 56600 }, { "epoch": 1.1163395089681243, "grad_norm": 2.1725780963897705, "learning_rate": 9.864002985746515e-07, "loss": 1.9595, "num_input_tokens_seen": 57743536, "step": 56700 }, { "epoch": 1.118308361717627, "grad_norm": 2.1328485012054443, "learning_rate": 9.863524967251454e-07, "loss": 2.0062, "num_input_tokens_seen": 57845104, "step": 56800 }, { "epoch": 1.12027721446713, "grad_norm": 1.820858120918274, "learning_rate": 9.86304612175499e-07, "loss": 1.9794, "num_input_tokens_seen": 57946904, "step": 56900 }, { "epoch": 1.1222460672166328, "grad_norm": 2.196723222732544, "learning_rate": 9.862566449338551e-07, "loss": 2.0205, "num_input_tokens_seen": 58048648, "step": 57000 }, { "epoch": 1.1242149199661358, "grad_norm": 2.0164146423339844, "learning_rate": 9.8620859500837e-07, "loss": 2.025, "num_input_tokens_seen": 58151048, "step": 57100 }, { "epoch": 1.1261837727156385, "grad_norm": 2.214961528778076, "learning_rate": 9.861604624072144e-07, "loss": 2.0139, "num_input_tokens_seen": 58252152, "step": 57200 }, { "epoch": 1.1281526254651415, "grad_norm": 2.091853380203247, "learning_rate": 9.861122471385725e-07, "loss": 1.9569, "num_input_tokens_seen": 58354552, "step": 57300 }, { "epoch": 1.1301214782146443, "grad_norm": 1.9607510566711426, "learning_rate": 9.86063949210643e-07, "loss": 1.986, "num_input_tokens_seen": 58456952, "step": 57400 }, { "epoch": 1.1320903309641472, "grad_norm": 2.089627265930176, "learning_rate": 9.860155686316385e-07, "loss": 2.0413, "num_input_tokens_seen": 58559352, "step": 57500 }, { "epoch": 1.13405918371365, "grad_norm": 2.056424140930176, "learning_rate": 9.85967105409786e-07, "loss": 2.0551, "num_input_tokens_seen": 58660832, "step": 57600 }, { "epoch": 1.136028036463153, "grad_norm": 2.1005749702453613, "learning_rate": 9.85918559553326e-07, "loss": 2.0195, "num_input_tokens_seen": 58763232, "step": 57700 }, { "epoch": 1.1379968892126557, "grad_norm": 2.014408826828003, "learning_rate": 9.858699310705132e-07, "loss": 2.0333, "num_input_tokens_seen": 58863984, "step": 57800 }, { "epoch": 1.1399657419621587, "grad_norm": 1.7907952070236206, "learning_rate": 9.858212199696168e-07, "loss": 1.9875, "num_input_tokens_seen": 58965472, "step": 57900 }, { "epoch": 1.1419345947116615, "grad_norm": 1.7765934467315674, "learning_rate": 9.857724262589196e-07, "loss": 1.9172, "num_input_tokens_seen": 59067872, "step": 58000 }, { "epoch": 1.1439034474611645, "grad_norm": 2.1868581771850586, "learning_rate": 9.857235499467183e-07, "loss": 1.9401, "num_input_tokens_seen": 59170272, "step": 58100 }, { "epoch": 1.1458723002106672, "grad_norm": 2.226532220840454, "learning_rate": 9.856745910413244e-07, "loss": 1.9258, "num_input_tokens_seen": 59272672, "step": 58200 }, { "epoch": 1.1478411529601702, "grad_norm": 1.9788908958435059, "learning_rate": 9.856255495510624e-07, "loss": 2.0125, "num_input_tokens_seen": 59375072, "step": 58300 }, { "epoch": 1.149810005709673, "grad_norm": 2.0415232181549072, "learning_rate": 9.855764254842717e-07, "loss": 1.9584, "num_input_tokens_seen": 59477472, "step": 58400 }, { "epoch": 1.151778858459176, "grad_norm": 1.9657243490219116, "learning_rate": 9.855272188493055e-07, "loss": 1.9672, "num_input_tokens_seen": 59579064, "step": 58500 }, { "epoch": 1.1537477112086787, "grad_norm": 2.114288091659546, "learning_rate": 9.854779296545307e-07, "loss": 1.9999, "num_input_tokens_seen": 59680728, "step": 58600 }, { "epoch": 1.1557165639581815, "grad_norm": 2.0081071853637695, "learning_rate": 9.854285579083288e-07, "loss": 2.0145, "num_input_tokens_seen": 59783128, "step": 58700 }, { "epoch": 1.1576854167076844, "grad_norm": 1.8314753770828247, "learning_rate": 9.853791036190951e-07, "loss": 1.9645, "num_input_tokens_seen": 59884232, "step": 58800 }, { "epoch": 1.1596542694571874, "grad_norm": 1.7582591772079468, "learning_rate": 9.853295667952384e-07, "loss": 1.9537, "num_input_tokens_seen": 59986632, "step": 58900 }, { "epoch": 1.1616231222066902, "grad_norm": 2.2304999828338623, "learning_rate": 9.852799474451824e-07, "loss": 2.0467, "num_input_tokens_seen": 60085184, "step": 59000 }, { "epoch": 1.163591974956193, "grad_norm": 1.8509920835494995, "learning_rate": 9.852302455773644e-07, "loss": 1.9878, "num_input_tokens_seen": 60187240, "step": 59100 }, { "epoch": 1.165560827705696, "grad_norm": 1.9994957447052002, "learning_rate": 9.851804612002357e-07, "loss": 2.027, "num_input_tokens_seen": 60289048, "step": 59200 }, { "epoch": 1.1675296804551987, "grad_norm": 1.8670504093170166, "learning_rate": 9.851305943222619e-07, "loss": 1.9659, "num_input_tokens_seen": 60391448, "step": 59300 }, { "epoch": 1.1694985332047017, "grad_norm": 1.867996335029602, "learning_rate": 9.850806449519223e-07, "loss": 1.9666, "num_input_tokens_seen": 60493848, "step": 59400 }, { "epoch": 1.1714673859542044, "grad_norm": 2.133732318878174, "learning_rate": 9.850306130977102e-07, "loss": 1.9786, "num_input_tokens_seen": 60594760, "step": 59500 }, { "epoch": 1.1734362387037074, "grad_norm": 1.793951392173767, "learning_rate": 9.849804987681334e-07, "loss": 1.9985, "num_input_tokens_seen": 60696576, "step": 59600 }, { "epoch": 1.1754050914532101, "grad_norm": 1.9974703788757324, "learning_rate": 9.849303019717134e-07, "loss": 2.0163, "num_input_tokens_seen": 60798120, "step": 59700 }, { "epoch": 1.1773739442027131, "grad_norm": 1.9553022384643555, "learning_rate": 9.848800227169855e-07, "loss": 1.9601, "num_input_tokens_seen": 60900520, "step": 59800 }, { "epoch": 1.1793427969522159, "grad_norm": 2.0108377933502197, "learning_rate": 9.848296610124995e-07, "loss": 1.9743, "num_input_tokens_seen": 61002920, "step": 59900 }, { "epoch": 1.1813116497017189, "grad_norm": 1.959549069404602, "learning_rate": 9.847792168668189e-07, "loss": 2.024, "num_input_tokens_seen": 61103056, "step": 60000 }, { "epoch": 1.1832805024512216, "grad_norm": 2.4373843669891357, "learning_rate": 9.847286902885212e-07, "loss": 1.9976, "num_input_tokens_seen": 61205456, "step": 60100 }, { "epoch": 1.1852493552007246, "grad_norm": 2.2283754348754883, "learning_rate": 9.846780812861981e-07, "loss": 1.9978, "num_input_tokens_seen": 61307856, "step": 60200 }, { "epoch": 1.1872182079502274, "grad_norm": 1.4838213920593262, "learning_rate": 9.846273898684555e-07, "loss": 2.0157, "num_input_tokens_seen": 61409440, "step": 60300 }, { "epoch": 1.1891870606997303, "grad_norm": 1.7456793785095215, "learning_rate": 9.845766160439125e-07, "loss": 2.0098, "num_input_tokens_seen": 61509456, "step": 60400 }, { "epoch": 1.191155913449233, "grad_norm": 1.8974699974060059, "learning_rate": 9.845257598212031e-07, "loss": 2.0143, "num_input_tokens_seen": 61610824, "step": 60500 }, { "epoch": 1.193124766198736, "grad_norm": 1.9952293634414673, "learning_rate": 9.844748212089748e-07, "loss": 1.9789, "num_input_tokens_seen": 61712376, "step": 60600 }, { "epoch": 1.1950936189482388, "grad_norm": 2.1045644283294678, "learning_rate": 9.844238002158896e-07, "loss": 2.0057, "num_input_tokens_seen": 61813528, "step": 60700 }, { "epoch": 1.1970624716977416, "grad_norm": 1.774985909461975, "learning_rate": 9.84372696850623e-07, "loss": 2.0387, "num_input_tokens_seen": 61915296, "step": 60800 }, { "epoch": 1.1990313244472446, "grad_norm": 2.020932912826538, "learning_rate": 9.843215111218646e-07, "loss": 1.9788, "num_input_tokens_seen": 62017384, "step": 60900 }, { "epoch": 1.2010001771967476, "grad_norm": 1.9226939678192139, "learning_rate": 9.842702430383183e-07, "loss": 1.9791, "num_input_tokens_seen": 62119056, "step": 61000 }, { "epoch": 1.2029690299462503, "grad_norm": 1.7677568197250366, "learning_rate": 9.842188926087016e-07, "loss": 1.965, "num_input_tokens_seen": 62220832, "step": 61100 }, { "epoch": 1.204937882695753, "grad_norm": 2.3421859741210938, "learning_rate": 9.841674598417462e-07, "loss": 1.9089, "num_input_tokens_seen": 62322504, "step": 61200 }, { "epoch": 1.206906735445256, "grad_norm": 1.7903199195861816, "learning_rate": 9.841159447461982e-07, "loss": 1.9467, "num_input_tokens_seen": 62423800, "step": 61300 }, { "epoch": 1.2088755881947588, "grad_norm": 4.110795974731445, "learning_rate": 9.840643473308167e-07, "loss": 1.9633, "num_input_tokens_seen": 62526200, "step": 61400 }, { "epoch": 1.2108444409442618, "grad_norm": 1.9007803201675415, "learning_rate": 9.84012667604376e-07, "loss": 1.9583, "num_input_tokens_seen": 62626904, "step": 61500 }, { "epoch": 1.2128132936937646, "grad_norm": 1.7440636157989502, "learning_rate": 9.839609055756633e-07, "loss": 1.9895, "num_input_tokens_seen": 62727760, "step": 61600 }, { "epoch": 1.2147821464432675, "grad_norm": 1.9405783414840698, "learning_rate": 9.839090612534807e-07, "loss": 2.0228, "num_input_tokens_seen": 62830160, "step": 61700 }, { "epoch": 1.2167509991927703, "grad_norm": 1.877098560333252, "learning_rate": 9.838571346466435e-07, "loss": 2.0125, "num_input_tokens_seen": 62931632, "step": 61800 }, { "epoch": 1.2187198519422733, "grad_norm": 1.8819992542266846, "learning_rate": 9.838051257639816e-07, "loss": 1.9714, "num_input_tokens_seen": 63033176, "step": 61900 }, { "epoch": 1.220688704691776, "grad_norm": 2.1795432567596436, "learning_rate": 9.83753034614339e-07, "loss": 2.0194, "num_input_tokens_seen": 63135576, "step": 62000 }, { "epoch": 1.222657557441279, "grad_norm": 1.9468737840652466, "learning_rate": 9.837008612065728e-07, "loss": 1.9567, "num_input_tokens_seen": 63237376, "step": 62100 }, { "epoch": 1.2246264101907818, "grad_norm": 1.988161563873291, "learning_rate": 9.836486055495548e-07, "loss": 1.9964, "num_input_tokens_seen": 63339776, "step": 62200 }, { "epoch": 1.2265952629402848, "grad_norm": 1.9543042182922363, "learning_rate": 9.835962676521708e-07, "loss": 1.9864, "num_input_tokens_seen": 63439912, "step": 62300 }, { "epoch": 1.2285641156897875, "grad_norm": 1.9151384830474854, "learning_rate": 9.835438475233204e-07, "loss": 2.0018, "num_input_tokens_seen": 63541744, "step": 62400 }, { "epoch": 1.2305329684392905, "grad_norm": 1.9542925357818604, "learning_rate": 9.834913451719168e-07, "loss": 2.0296, "num_input_tokens_seen": 63643704, "step": 62500 }, { "epoch": 1.2325018211887933, "grad_norm": 1.7564761638641357, "learning_rate": 9.834387606068883e-07, "loss": 1.9982, "num_input_tokens_seen": 63745576, "step": 62600 }, { "epoch": 1.2344706739382962, "grad_norm": 1.9793672561645508, "learning_rate": 9.833860938371758e-07, "loss": 1.9754, "num_input_tokens_seen": 63847888, "step": 62700 }, { "epoch": 1.236439526687799, "grad_norm": 1.9184461832046509, "learning_rate": 9.833333448717354e-07, "loss": 2.0246, "num_input_tokens_seen": 63949704, "step": 62800 }, { "epoch": 1.238408379437302, "grad_norm": 1.7936235666275024, "learning_rate": 9.832805137195362e-07, "loss": 1.9625, "num_input_tokens_seen": 64051544, "step": 62900 }, { "epoch": 1.2403772321868047, "grad_norm": 1.685490369796753, "learning_rate": 9.832276003895616e-07, "loss": 1.9919, "num_input_tokens_seen": 64153944, "step": 63000 }, { "epoch": 1.2423460849363077, "grad_norm": 1.8245015144348145, "learning_rate": 9.831746048908095e-07, "loss": 1.9627, "num_input_tokens_seen": 64255584, "step": 63100 }, { "epoch": 1.2443149376858105, "grad_norm": 3.798107624053955, "learning_rate": 9.83121527232291e-07, "loss": 1.9801, "num_input_tokens_seen": 64356992, "step": 63200 }, { "epoch": 1.2462837904353132, "grad_norm": 1.8981205224990845, "learning_rate": 9.830683674230318e-07, "loss": 2.0256, "num_input_tokens_seen": 64456856, "step": 63300 }, { "epoch": 1.2482526431848162, "grad_norm": 1.7309049367904663, "learning_rate": 9.83015125472071e-07, "loss": 1.9839, "num_input_tokens_seen": 64559256, "step": 63400 }, { "epoch": 1.2502214959343192, "grad_norm": 1.970591425895691, "learning_rate": 9.82961801388462e-07, "loss": 2.0219, "num_input_tokens_seen": 64661656, "step": 63500 }, { "epoch": 1.252190348683822, "grad_norm": 2.072432279586792, "learning_rate": 9.829083951812723e-07, "loss": 2.0168, "num_input_tokens_seen": 64763960, "step": 63600 }, { "epoch": 1.2541592014333247, "grad_norm": 1.9331374168395996, "learning_rate": 9.828549068595829e-07, "loss": 1.9511, "num_input_tokens_seen": 64865648, "step": 63700 }, { "epoch": 1.2561280541828277, "grad_norm": 1.9606348276138306, "learning_rate": 9.828013364324892e-07, "loss": 1.9926, "num_input_tokens_seen": 64967240, "step": 63800 }, { "epoch": 1.2580969069323304, "grad_norm": 1.8813121318817139, "learning_rate": 9.827476839091006e-07, "loss": 1.9805, "num_input_tokens_seen": 65068880, "step": 63900 }, { "epoch": 1.2600657596818334, "grad_norm": 1.8504828214645386, "learning_rate": 9.826939492985398e-07, "loss": 1.9444, "num_input_tokens_seen": 65171280, "step": 64000 }, { "epoch": 1.2620346124313362, "grad_norm": 2.1883697509765625, "learning_rate": 9.826401326099442e-07, "loss": 1.9323, "num_input_tokens_seen": 65273680, "step": 64100 }, { "epoch": 1.2640034651808392, "grad_norm": 2.226407527923584, "learning_rate": 9.825862338524648e-07, "loss": 2.0457, "num_input_tokens_seen": 65375520, "step": 64200 }, { "epoch": 1.265972317930342, "grad_norm": 2.0757224559783936, "learning_rate": 9.825322530352666e-07, "loss": 1.9929, "num_input_tokens_seen": 65477072, "step": 64300 }, { "epoch": 1.267941170679845, "grad_norm": 2.087914228439331, "learning_rate": 9.824781901675287e-07, "loss": 1.9284, "num_input_tokens_seen": 65578792, "step": 64400 }, { "epoch": 1.2699100234293477, "grad_norm": 2.0337281227111816, "learning_rate": 9.82424045258444e-07, "loss": 1.9872, "num_input_tokens_seen": 65681192, "step": 64500 }, { "epoch": 1.2718788761788506, "grad_norm": 2.0307414531707764, "learning_rate": 9.823698183172192e-07, "loss": 2.009, "num_input_tokens_seen": 65782744, "step": 64600 }, { "epoch": 1.2738477289283534, "grad_norm": 1.97044038772583, "learning_rate": 9.823155093530755e-07, "loss": 1.9632, "num_input_tokens_seen": 65885144, "step": 64700 }, { "epoch": 1.2758165816778564, "grad_norm": 2.425098419189453, "learning_rate": 9.822611183752473e-07, "loss": 1.9498, "num_input_tokens_seen": 65987544, "step": 64800 }, { "epoch": 1.2777854344273591, "grad_norm": 1.8895459175109863, "learning_rate": 9.822066453929839e-07, "loss": 2.0062, "num_input_tokens_seen": 66089416, "step": 64900 }, { "epoch": 1.279754287176862, "grad_norm": 2.0372626781463623, "learning_rate": 9.821520904155471e-07, "loss": 1.9604, "num_input_tokens_seen": 66191056, "step": 65000 }, { "epoch": 1.2817231399263649, "grad_norm": 2.016897439956665, "learning_rate": 9.820974534522142e-07, "loss": 1.9932, "num_input_tokens_seen": 66293456, "step": 65100 }, { "epoch": 1.2836919926758679, "grad_norm": 1.8881981372833252, "learning_rate": 9.820427345122755e-07, "loss": 1.9576, "num_input_tokens_seen": 66394368, "step": 65200 }, { "epoch": 1.2856608454253706, "grad_norm": 2.034299612045288, "learning_rate": 9.819879336050355e-07, "loss": 1.9723, "num_input_tokens_seen": 66496664, "step": 65300 }, { "epoch": 1.2876296981748734, "grad_norm": 2.127408981323242, "learning_rate": 9.819330507398127e-07, "loss": 1.9823, "num_input_tokens_seen": 66598288, "step": 65400 }, { "epoch": 1.2895985509243764, "grad_norm": 2.232555866241455, "learning_rate": 9.818780859259395e-07, "loss": 1.9796, "num_input_tokens_seen": 66700688, "step": 65500 }, { "epoch": 1.2915674036738793, "grad_norm": 1.929018259048462, "learning_rate": 9.81823039172762e-07, "loss": 2.0197, "num_input_tokens_seen": 66802256, "step": 65600 }, { "epoch": 1.293536256423382, "grad_norm": 2.0691864490509033, "learning_rate": 9.817679104896407e-07, "loss": 1.9878, "num_input_tokens_seen": 66904656, "step": 65700 }, { "epoch": 1.2955051091728849, "grad_norm": 2.009652853012085, "learning_rate": 9.817126998859493e-07, "loss": 1.9737, "num_input_tokens_seen": 67005536, "step": 65800 }, { "epoch": 1.2974739619223878, "grad_norm": 1.7023661136627197, "learning_rate": 9.816574073710767e-07, "loss": 1.9983, "num_input_tokens_seen": 67105736, "step": 65900 }, { "epoch": 1.2994428146718908, "grad_norm": 1.9258289337158203, "learning_rate": 9.81602032954424e-07, "loss": 1.9838, "num_input_tokens_seen": 67208136, "step": 66000 }, { "epoch": 1.3014116674213936, "grad_norm": 1.9641227722167969, "learning_rate": 9.81546576645408e-07, "loss": 1.9792, "num_input_tokens_seen": 67310536, "step": 66100 }, { "epoch": 1.3033805201708963, "grad_norm": 1.8868162631988525, "learning_rate": 9.81491038453458e-07, "loss": 1.9768, "num_input_tokens_seen": 67412160, "step": 66200 }, { "epoch": 1.3053493729203993, "grad_norm": 1.6703177690505981, "learning_rate": 9.814354183880182e-07, "loss": 1.9902, "num_input_tokens_seen": 67513384, "step": 66300 }, { "epoch": 1.307318225669902, "grad_norm": 1.9915131330490112, "learning_rate": 9.81379716458546e-07, "loss": 1.979, "num_input_tokens_seen": 67614232, "step": 66400 }, { "epoch": 1.309287078419405, "grad_norm": 1.8469550609588623, "learning_rate": 9.813239326745131e-07, "loss": 2.0128, "num_input_tokens_seen": 67716632, "step": 66500 }, { "epoch": 1.3112559311689078, "grad_norm": 1.702362298965454, "learning_rate": 9.812680670454051e-07, "loss": 2.0334, "num_input_tokens_seen": 67818328, "step": 66600 }, { "epoch": 1.3132247839184108, "grad_norm": 2.065275192260742, "learning_rate": 9.812121195807216e-07, "loss": 1.9849, "num_input_tokens_seen": 67919528, "step": 66700 }, { "epoch": 1.3151936366679136, "grad_norm": 1.8248300552368164, "learning_rate": 9.811560902899758e-07, "loss": 2.0113, "num_input_tokens_seen": 68021928, "step": 66800 }, { "epoch": 1.3171624894174165, "grad_norm": 1.6874494552612305, "learning_rate": 9.810999791826953e-07, "loss": 1.9536, "num_input_tokens_seen": 68124328, "step": 66900 }, { "epoch": 1.3191313421669193, "grad_norm": 1.9437017440795898, "learning_rate": 9.81043786268421e-07, "loss": 1.9535, "num_input_tokens_seen": 68225256, "step": 67000 }, { "epoch": 1.3211001949164223, "grad_norm": 1.8744845390319824, "learning_rate": 9.80987511556708e-07, "loss": 1.9658, "num_input_tokens_seen": 68326312, "step": 67100 }, { "epoch": 1.323069047665925, "grad_norm": 1.9006444215774536, "learning_rate": 9.809311550571259e-07, "loss": 1.9966, "num_input_tokens_seen": 68428712, "step": 67200 }, { "epoch": 1.325037900415428, "grad_norm": 2.078761339187622, "learning_rate": 9.808747167792567e-07, "loss": 1.983, "num_input_tokens_seen": 68529312, "step": 67300 }, { "epoch": 1.3270067531649308, "grad_norm": 2.2462334632873535, "learning_rate": 9.808181967326983e-07, "loss": 1.9474, "num_input_tokens_seen": 68631264, "step": 67400 }, { "epoch": 1.3289756059144335, "grad_norm": 2.1578781604766846, "learning_rate": 9.807615949270606e-07, "loss": 1.9404, "num_input_tokens_seen": 68733664, "step": 67500 }, { "epoch": 1.3309444586639365, "grad_norm": 1.954037070274353, "learning_rate": 9.807049113719687e-07, "loss": 1.9482, "num_input_tokens_seen": 68835272, "step": 67600 }, { "epoch": 1.3329133114134395, "grad_norm": 2.041626453399658, "learning_rate": 9.806481460770613e-07, "loss": 1.9334, "num_input_tokens_seen": 68937672, "step": 67700 }, { "epoch": 1.3348821641629423, "grad_norm": 1.780411720275879, "learning_rate": 9.805912990519903e-07, "loss": 1.95, "num_input_tokens_seen": 69039592, "step": 67800 }, { "epoch": 1.336851016912445, "grad_norm": 7.53485631942749, "learning_rate": 9.805343703064226e-07, "loss": 1.97, "num_input_tokens_seen": 69141136, "step": 67900 }, { "epoch": 1.338819869661948, "grad_norm": 1.8268909454345703, "learning_rate": 9.804773598500381e-07, "loss": 1.9966, "num_input_tokens_seen": 69243536, "step": 68000 }, { "epoch": 1.340788722411451, "grad_norm": 1.770240068435669, "learning_rate": 9.804202676925313e-07, "loss": 1.981, "num_input_tokens_seen": 69345808, "step": 68100 }, { "epoch": 1.3427575751609537, "grad_norm": 1.959314227104187, "learning_rate": 9.8036309384361e-07, "loss": 1.9632, "num_input_tokens_seen": 69447728, "step": 68200 }, { "epoch": 1.3447264279104565, "grad_norm": 1.9582931995391846, "learning_rate": 9.803058383129958e-07, "loss": 1.9684, "num_input_tokens_seen": 69548456, "step": 68300 }, { "epoch": 1.3466952806599595, "grad_norm": 1.813268780708313, "learning_rate": 9.802485011104254e-07, "loss": 1.9948, "num_input_tokens_seen": 69649928, "step": 68400 }, { "epoch": 1.3486641334094622, "grad_norm": 2.0228004455566406, "learning_rate": 9.801910822456476e-07, "loss": 1.9943, "num_input_tokens_seen": 69750912, "step": 68500 }, { "epoch": 1.3506329861589652, "grad_norm": 1.9284993410110474, "learning_rate": 9.801335817284266e-07, "loss": 1.9669, "num_input_tokens_seen": 69852200, "step": 68600 }, { "epoch": 1.352601838908468, "grad_norm": 2.1699578762054443, "learning_rate": 9.800759995685395e-07, "loss": 1.9891, "num_input_tokens_seen": 69952848, "step": 68700 }, { "epoch": 1.354570691657971, "grad_norm": 2.152777910232544, "learning_rate": 9.800183357757778e-07, "loss": 2.0238, "num_input_tokens_seen": 70054280, "step": 68800 }, { "epoch": 1.3565395444074737, "grad_norm": 1.7112054824829102, "learning_rate": 9.79960590359947e-07, "loss": 1.9838, "num_input_tokens_seen": 70155976, "step": 68900 }, { "epoch": 1.3585083971569767, "grad_norm": 1.9984151124954224, "learning_rate": 9.799027633308658e-07, "loss": 1.9918, "num_input_tokens_seen": 70258376, "step": 69000 }, { "epoch": 1.3604772499064794, "grad_norm": 2.0464658737182617, "learning_rate": 9.798448546983674e-07, "loss": 1.9936, "num_input_tokens_seen": 70358560, "step": 69100 }, { "epoch": 1.3624461026559824, "grad_norm": 1.952075719833374, "learning_rate": 9.797868644722987e-07, "loss": 1.9968, "num_input_tokens_seen": 70460960, "step": 69200 }, { "epoch": 1.3644149554054852, "grad_norm": 1.8766202926635742, "learning_rate": 9.797287926625203e-07, "loss": 1.9406, "num_input_tokens_seen": 70562768, "step": 69300 }, { "epoch": 1.3663838081549882, "grad_norm": 1.9930871725082397, "learning_rate": 9.796706392789072e-07, "loss": 1.9732, "num_input_tokens_seen": 70663960, "step": 69400 }, { "epoch": 1.368352660904491, "grad_norm": 1.9259198904037476, "learning_rate": 9.796124043313474e-07, "loss": 1.9615, "num_input_tokens_seen": 70765448, "step": 69500 }, { "epoch": 1.3703215136539937, "grad_norm": 1.7779191732406616, "learning_rate": 9.795540878297434e-07, "loss": 1.9313, "num_input_tokens_seen": 70867144, "step": 69600 }, { "epoch": 1.3722903664034967, "grad_norm": 1.705250859260559, "learning_rate": 9.794956897840118e-07, "loss": 1.9658, "num_input_tokens_seen": 70968928, "step": 69700 }, { "epoch": 1.3742592191529996, "grad_norm": 1.8308429718017578, "learning_rate": 9.794372102040822e-07, "loss": 1.9558, "num_input_tokens_seen": 71071328, "step": 69800 }, { "epoch": 1.3762280719025024, "grad_norm": 1.8280293941497803, "learning_rate": 9.793786490998987e-07, "loss": 1.9873, "num_input_tokens_seen": 71171512, "step": 69900 }, { "epoch": 1.3781969246520052, "grad_norm": 2.1454601287841797, "learning_rate": 9.793200064814193e-07, "loss": 1.9997, "num_input_tokens_seen": 71273208, "step": 70000 }, { "epoch": 1.3801657774015081, "grad_norm": 2.003838539123535, "learning_rate": 9.792612823586158e-07, "loss": 1.9791, "num_input_tokens_seen": 71375608, "step": 70100 }, { "epoch": 1.3821346301510111, "grad_norm": 1.8229576349258423, "learning_rate": 9.792024767414731e-07, "loss": 1.9928, "num_input_tokens_seen": 71477360, "step": 70200 }, { "epoch": 1.3841034829005139, "grad_norm": 1.9113233089447021, "learning_rate": 9.791435896399913e-07, "loss": 1.9577, "num_input_tokens_seen": 71579760, "step": 70300 }, { "epoch": 1.3860723356500166, "grad_norm": 1.8677958250045776, "learning_rate": 9.790846210641832e-07, "loss": 1.9734, "num_input_tokens_seen": 71682160, "step": 70400 }, { "epoch": 1.3880411883995196, "grad_norm": 2.3678808212280273, "learning_rate": 9.79025571024076e-07, "loss": 2.0064, "num_input_tokens_seen": 71783792, "step": 70500 }, { "epoch": 1.3900100411490226, "grad_norm": 2.2890989780426025, "learning_rate": 9.789664395297109e-07, "loss": 2.0239, "num_input_tokens_seen": 71885384, "step": 70600 }, { "epoch": 1.3919788938985254, "grad_norm": 1.7234455347061157, "learning_rate": 9.789072265911426e-07, "loss": 1.9708, "num_input_tokens_seen": 71987128, "step": 70700 }, { "epoch": 1.3939477466480281, "grad_norm": 1.7842670679092407, "learning_rate": 9.788479322184395e-07, "loss": 1.9854, "num_input_tokens_seen": 72088096, "step": 70800 }, { "epoch": 1.395916599397531, "grad_norm": 1.779493808746338, "learning_rate": 9.787885564216844e-07, "loss": 2.0041, "num_input_tokens_seen": 72189944, "step": 70900 }, { "epoch": 1.3978854521470339, "grad_norm": 1.9403914213180542, "learning_rate": 9.787290992109737e-07, "loss": 1.9884, "num_input_tokens_seen": 72290592, "step": 71000 }, { "epoch": 1.3998543048965368, "grad_norm": 2.0947275161743164, "learning_rate": 9.786695605964175e-07, "loss": 1.9893, "num_input_tokens_seen": 72391536, "step": 71100 }, { "epoch": 1.4018231576460396, "grad_norm": 1.8270337581634521, "learning_rate": 9.786099405881395e-07, "loss": 1.9986, "num_input_tokens_seen": 72492576, "step": 71200 }, { "epoch": 1.4037920103955426, "grad_norm": 2.048389196395874, "learning_rate": 9.785502391962781e-07, "loss": 1.9589, "num_input_tokens_seen": 72593528, "step": 71300 }, { "epoch": 1.4057608631450453, "grad_norm": 1.9211962223052979, "learning_rate": 9.78490456430985e-07, "loss": 1.9909, "num_input_tokens_seen": 72695928, "step": 71400 }, { "epoch": 1.4077297158945483, "grad_norm": 1.9742146730422974, "learning_rate": 9.78430592302425e-07, "loss": 2.0325, "num_input_tokens_seen": 72797424, "step": 71500 }, { "epoch": 1.409698568644051, "grad_norm": 1.8823710680007935, "learning_rate": 9.783706468207784e-07, "loss": 1.9457, "num_input_tokens_seen": 72899824, "step": 71600 }, { "epoch": 1.411667421393554, "grad_norm": 1.8808926343917847, "learning_rate": 9.783106199962381e-07, "loss": 2.0035, "num_input_tokens_seen": 73000784, "step": 71700 }, { "epoch": 1.4136362741430568, "grad_norm": 2.0258290767669678, "learning_rate": 9.782505118390109e-07, "loss": 1.9815, "num_input_tokens_seen": 73102680, "step": 71800 }, { "epoch": 1.4156051268925598, "grad_norm": 1.8572781085968018, "learning_rate": 9.781903223593182e-07, "loss": 1.9955, "num_input_tokens_seen": 73205080, "step": 71900 }, { "epoch": 1.4175739796420626, "grad_norm": 2.2411041259765625, "learning_rate": 9.781300515673942e-07, "loss": 2.0017, "num_input_tokens_seen": 73306728, "step": 72000 }, { "epoch": 1.4195428323915653, "grad_norm": 2.064722776412964, "learning_rate": 9.78069699473488e-07, "loss": 1.9744, "num_input_tokens_seen": 73409128, "step": 72100 }, { "epoch": 1.4215116851410683, "grad_norm": 2.029839515686035, "learning_rate": 9.780092660878614e-07, "loss": 1.9744, "num_input_tokens_seen": 73511032, "step": 72200 }, { "epoch": 1.4234805378905713, "grad_norm": 1.7096360921859741, "learning_rate": 9.779487514207906e-07, "loss": 2.0243, "num_input_tokens_seen": 73612768, "step": 72300 }, { "epoch": 1.425449390640074, "grad_norm": 1.8959747552871704, "learning_rate": 9.778881554825662e-07, "loss": 1.9732, "num_input_tokens_seen": 73714232, "step": 72400 }, { "epoch": 1.4274182433895768, "grad_norm": 2.193753957748413, "learning_rate": 9.778274782834915e-07, "loss": 1.9609, "num_input_tokens_seen": 73816232, "step": 72500 }, { "epoch": 1.4293870961390798, "grad_norm": 1.890655755996704, "learning_rate": 9.777667198338843e-07, "loss": 1.9909, "num_input_tokens_seen": 73918632, "step": 72600 }, { "epoch": 1.4313559488885828, "grad_norm": 1.971684455871582, "learning_rate": 9.77705880144076e-07, "loss": 2.0129, "num_input_tokens_seen": 74020240, "step": 72700 }, { "epoch": 1.4333248016380855, "grad_norm": 1.9532390832901, "learning_rate": 9.77644959224412e-07, "loss": 1.9846, "num_input_tokens_seen": 74119088, "step": 72800 }, { "epoch": 1.4352936543875883, "grad_norm": 2.263477325439453, "learning_rate": 9.775839570852512e-07, "loss": 2.0401, "num_input_tokens_seen": 74221488, "step": 72900 }, { "epoch": 1.4372625071370912, "grad_norm": 1.9588102102279663, "learning_rate": 9.77522873736967e-07, "loss": 2.0336, "num_input_tokens_seen": 74322552, "step": 73000 }, { "epoch": 1.439231359886594, "grad_norm": 2.2284955978393555, "learning_rate": 9.774617091899455e-07, "loss": 1.9896, "num_input_tokens_seen": 74424288, "step": 73100 }, { "epoch": 1.441200212636097, "grad_norm": 1.8370540142059326, "learning_rate": 9.774004634545874e-07, "loss": 2.0052, "num_input_tokens_seen": 74526688, "step": 73200 }, { "epoch": 1.4431690653855997, "grad_norm": 2.0275652408599854, "learning_rate": 9.77339136541307e-07, "loss": 2.0013, "num_input_tokens_seen": 74628320, "step": 73300 }, { "epoch": 1.4451379181351027, "grad_norm": 2.0400390625, "learning_rate": 9.772777284605327e-07, "loss": 1.9748, "num_input_tokens_seen": 74729792, "step": 73400 }, { "epoch": 1.4471067708846055, "grad_norm": 1.8525439500808716, "learning_rate": 9.772162392227062e-07, "loss": 2.0092, "num_input_tokens_seen": 74831344, "step": 73500 }, { "epoch": 1.4490756236341085, "grad_norm": 1.9950172901153564, "learning_rate": 9.771546688382831e-07, "loss": 2.0175, "num_input_tokens_seen": 74931784, "step": 73600 }, { "epoch": 1.4510444763836112, "grad_norm": 1.8894357681274414, "learning_rate": 9.770930173177332e-07, "loss": 1.9752, "num_input_tokens_seen": 75034184, "step": 73700 }, { "epoch": 1.4530133291331142, "grad_norm": 2.0317158699035645, "learning_rate": 9.770312846715397e-07, "loss": 1.9799, "num_input_tokens_seen": 75135936, "step": 73800 }, { "epoch": 1.454982181882617, "grad_norm": 1.9283066987991333, "learning_rate": 9.769694709101997e-07, "loss": 1.9754, "num_input_tokens_seen": 75237440, "step": 73900 }, { "epoch": 1.45695103463212, "grad_norm": 1.9150584936141968, "learning_rate": 9.76907576044224e-07, "loss": 1.9905, "num_input_tokens_seen": 75338424, "step": 74000 }, { "epoch": 1.4589198873816227, "grad_norm": 1.8755513429641724, "learning_rate": 9.768456000841374e-07, "loss": 1.9839, "num_input_tokens_seen": 75440200, "step": 74100 }, { "epoch": 1.4608887401311255, "grad_norm": 2.5257606506347656, "learning_rate": 9.767835430404785e-07, "loss": 1.9775, "num_input_tokens_seen": 75542600, "step": 74200 }, { "epoch": 1.4628575928806284, "grad_norm": 2.086940050125122, "learning_rate": 9.767214049237996e-07, "loss": 1.9546, "num_input_tokens_seen": 75644952, "step": 74300 }, { "epoch": 1.4648264456301314, "grad_norm": 1.6934620141983032, "learning_rate": 9.766591857446664e-07, "loss": 1.9341, "num_input_tokens_seen": 75747352, "step": 74400 }, { "epoch": 1.4667952983796342, "grad_norm": 1.8574316501617432, "learning_rate": 9.765968855136593e-07, "loss": 1.9894, "num_input_tokens_seen": 75849752, "step": 74500 }, { "epoch": 1.468764151129137, "grad_norm": 2.30375075340271, "learning_rate": 9.765345042413713e-07, "loss": 1.9596, "num_input_tokens_seen": 75951408, "step": 74600 }, { "epoch": 1.47073300387864, "grad_norm": 1.8743842840194702, "learning_rate": 9.764720419384102e-07, "loss": 2.0778, "num_input_tokens_seen": 76052984, "step": 74700 }, { "epoch": 1.472701856628143, "grad_norm": 2.6566689014434814, "learning_rate": 9.764094986153972e-07, "loss": 2.046, "num_input_tokens_seen": 76155208, "step": 74800 }, { "epoch": 1.4746707093776457, "grad_norm": 2.362420082092285, "learning_rate": 9.763468742829673e-07, "loss": 1.9592, "num_input_tokens_seen": 76257608, "step": 74900 }, { "epoch": 1.4766395621271484, "grad_norm": 2.2277305126190186, "learning_rate": 9.76284168951769e-07, "loss": 1.9825, "num_input_tokens_seen": 76360008, "step": 75000 }, { "epoch": 1.4786084148766514, "grad_norm": 2.019000768661499, "learning_rate": 9.76221382632465e-07, "loss": 1.9805, "num_input_tokens_seen": 76461784, "step": 75100 }, { "epoch": 1.4805772676261542, "grad_norm": 2.016059160232544, "learning_rate": 9.761585153357315e-07, "loss": 1.9806, "num_input_tokens_seen": 76563816, "step": 75200 }, { "epoch": 1.4825461203756571, "grad_norm": 2.0543510913848877, "learning_rate": 9.760955670722584e-07, "loss": 1.9819, "num_input_tokens_seen": 76666216, "step": 75300 }, { "epoch": 1.48451497312516, "grad_norm": 1.8296061754226685, "learning_rate": 9.760325378527498e-07, "loss": 1.944, "num_input_tokens_seen": 76768616, "step": 75400 }, { "epoch": 1.4864838258746629, "grad_norm": 1.9528096914291382, "learning_rate": 9.759694276879233e-07, "loss": 1.974, "num_input_tokens_seen": 76871016, "step": 75500 }, { "epoch": 1.4884526786241656, "grad_norm": 1.9242920875549316, "learning_rate": 9.7590623658851e-07, "loss": 2.0285, "num_input_tokens_seen": 76973416, "step": 75600 }, { "epoch": 1.4904215313736686, "grad_norm": 2.241938829421997, "learning_rate": 9.758429645652552e-07, "loss": 1.9852, "num_input_tokens_seen": 77074376, "step": 75700 }, { "epoch": 1.4923903841231714, "grad_norm": 2.608829975128174, "learning_rate": 9.757796116289176e-07, "loss": 1.9513, "num_input_tokens_seen": 77175984, "step": 75800 }, { "epoch": 1.4943592368726744, "grad_norm": 2.105858087539673, "learning_rate": 9.757161777902703e-07, "loss": 1.9967, "num_input_tokens_seen": 77277296, "step": 75900 }, { "epoch": 1.4963280896221771, "grad_norm": 2.0412871837615967, "learning_rate": 9.75652663060099e-07, "loss": 2.0007, "num_input_tokens_seen": 77379448, "step": 76000 }, { "epoch": 1.49829694237168, "grad_norm": 1.908644199371338, "learning_rate": 9.755890674492043e-07, "loss": 2.004, "num_input_tokens_seen": 77480992, "step": 76100 }, { "epoch": 1.5002657951211829, "grad_norm": 1.8958549499511719, "learning_rate": 9.755253909683999e-07, "loss": 2.0009, "num_input_tokens_seen": 77583392, "step": 76200 }, { "epoch": 1.5022346478706856, "grad_norm": 1.7821376323699951, "learning_rate": 9.754616336285136e-07, "loss": 1.973, "num_input_tokens_seen": 77684488, "step": 76300 }, { "epoch": 1.5042035006201886, "grad_norm": 2.1679296493530273, "learning_rate": 9.753977954403868e-07, "loss": 1.9916, "num_input_tokens_seen": 77785016, "step": 76400 }, { "epoch": 1.5061723533696916, "grad_norm": 1.8659836053848267, "learning_rate": 9.753338764148745e-07, "loss": 1.9824, "num_input_tokens_seen": 77887416, "step": 76500 }, { "epoch": 1.5081412061191943, "grad_norm": 1.8900649547576904, "learning_rate": 9.752698765628456e-07, "loss": 1.9843, "num_input_tokens_seen": 77989264, "step": 76600 }, { "epoch": 1.510110058868697, "grad_norm": 2.197361946105957, "learning_rate": 9.75205795895183e-07, "loss": 1.9854, "num_input_tokens_seen": 78090520, "step": 76700 }, { "epoch": 1.5120789116182, "grad_norm": 1.8914963006973267, "learning_rate": 9.751416344227828e-07, "loss": 1.9851, "num_input_tokens_seen": 78192232, "step": 76800 }, { "epoch": 1.514047764367703, "grad_norm": 2.121875762939453, "learning_rate": 9.750773921565551e-07, "loss": 2.0401, "num_input_tokens_seen": 78294632, "step": 76900 }, { "epoch": 1.5160166171172058, "grad_norm": 1.6164422035217285, "learning_rate": 9.75013069107424e-07, "loss": 1.9996, "num_input_tokens_seen": 78396368, "step": 77000 }, { "epoch": 1.5179854698667086, "grad_norm": 1.7804019451141357, "learning_rate": 9.749486652863268e-07, "loss": 1.9095, "num_input_tokens_seen": 78498216, "step": 77100 }, { "epoch": 1.5199543226162116, "grad_norm": 1.8734009265899658, "learning_rate": 9.74884180704215e-07, "loss": 2.0188, "num_input_tokens_seen": 78599792, "step": 77200 }, { "epoch": 1.5219231753657145, "grad_norm": 1.8652548789978027, "learning_rate": 9.748196153720536e-07, "loss": 2.0122, "num_input_tokens_seen": 78702192, "step": 77300 }, { "epoch": 1.5238920281152173, "grad_norm": 1.9243494272232056, "learning_rate": 9.747549693008214e-07, "loss": 1.9853, "num_input_tokens_seen": 78804592, "step": 77400 }, { "epoch": 1.52586088086472, "grad_norm": 2.356846332550049, "learning_rate": 9.746902425015112e-07, "loss": 2.0327, "num_input_tokens_seen": 78906064, "step": 77500 }, { "epoch": 1.527829733614223, "grad_norm": 10.839113235473633, "learning_rate": 9.746254349851286e-07, "loss": 2.0012, "num_input_tokens_seen": 79008024, "step": 77600 }, { "epoch": 1.529798586363726, "grad_norm": 1.8322460651397705, "learning_rate": 9.745605467626943e-07, "loss": 2.0063, "num_input_tokens_seen": 79110120, "step": 77700 }, { "epoch": 1.5317674391132288, "grad_norm": 1.9234142303466797, "learning_rate": 9.744955778452414e-07, "loss": 1.947, "num_input_tokens_seen": 79211944, "step": 77800 }, { "epoch": 1.5337362918627315, "grad_norm": 1.7628679275512695, "learning_rate": 9.744305282438177e-07, "loss": 1.9934, "num_input_tokens_seen": 79312744, "step": 77900 }, { "epoch": 1.5357051446122343, "grad_norm": 2.5412709712982178, "learning_rate": 9.743653979694841e-07, "loss": 1.9765, "num_input_tokens_seen": 79415144, "step": 78000 }, { "epoch": 1.5376739973617373, "grad_norm": 2.0403952598571777, "learning_rate": 9.743001870333156e-07, "loss": 2.0687, "num_input_tokens_seen": 79517256, "step": 78100 }, { "epoch": 1.5396428501112402, "grad_norm": 2.046607255935669, "learning_rate": 9.742348954464007e-07, "loss": 1.9899, "num_input_tokens_seen": 79619112, "step": 78200 }, { "epoch": 1.541611702860743, "grad_norm": 2.3528730869293213, "learning_rate": 9.74169523219842e-07, "loss": 1.9849, "num_input_tokens_seen": 79720952, "step": 78300 }, { "epoch": 1.5435805556102458, "grad_norm": 1.8831902742385864, "learning_rate": 9.74104070364755e-07, "loss": 1.9927, "num_input_tokens_seen": 79821640, "step": 78400 }, { "epoch": 1.5455494083597487, "grad_norm": 2.3101558685302734, "learning_rate": 9.740385368922695e-07, "loss": 1.9666, "num_input_tokens_seen": 79924040, "step": 78500 }, { "epoch": 1.5475182611092517, "grad_norm": 2.005356550216675, "learning_rate": 9.739729228135291e-07, "loss": 1.9533, "num_input_tokens_seen": 80026440, "step": 78600 }, { "epoch": 1.5494871138587545, "grad_norm": 1.7179080247879028, "learning_rate": 9.739072281396908e-07, "loss": 1.9791, "num_input_tokens_seen": 80128840, "step": 78700 }, { "epoch": 1.5514559666082572, "grad_norm": 2.230632781982422, "learning_rate": 9.738414528819257e-07, "loss": 2.0118, "num_input_tokens_seen": 80229680, "step": 78800 }, { "epoch": 1.5534248193577602, "grad_norm": 1.7507121562957764, "learning_rate": 9.737755970514179e-07, "loss": 1.9749, "num_input_tokens_seen": 80331384, "step": 78900 }, { "epoch": 1.5553936721072632, "grad_norm": 1.8396021127700806, "learning_rate": 9.737096606593658e-07, "loss": 1.9282, "num_input_tokens_seen": 80433784, "step": 79000 }, { "epoch": 1.557362524856766, "grad_norm": 2.0760533809661865, "learning_rate": 9.736436437169813e-07, "loss": 2.0078, "num_input_tokens_seen": 80535416, "step": 79100 }, { "epoch": 1.5593313776062687, "grad_norm": 1.8407785892486572, "learning_rate": 9.735775462354904e-07, "loss": 2.0361, "num_input_tokens_seen": 80637360, "step": 79200 }, { "epoch": 1.5613002303557717, "grad_norm": 1.8386290073394775, "learning_rate": 9.735113682261318e-07, "loss": 1.9458, "num_input_tokens_seen": 80739160, "step": 79300 }, { "epoch": 1.5632690831052747, "grad_norm": 2.1957356929779053, "learning_rate": 9.734451097001588e-07, "loss": 2.0063, "num_input_tokens_seen": 80840392, "step": 79400 }, { "epoch": 1.5652379358547774, "grad_norm": 2.0933775901794434, "learning_rate": 9.733787706688383e-07, "loss": 1.9894, "num_input_tokens_seen": 80942792, "step": 79500 }, { "epoch": 1.5672067886042802, "grad_norm": 1.7478200197219849, "learning_rate": 9.733123511434504e-07, "loss": 2.0121, "num_input_tokens_seen": 81044760, "step": 79600 }, { "epoch": 1.5691756413537832, "grad_norm": 1.8533891439437866, "learning_rate": 9.732458511352894e-07, "loss": 2.0458, "num_input_tokens_seen": 81146360, "step": 79700 }, { "epoch": 1.5711444941032862, "grad_norm": 1.640206217765808, "learning_rate": 9.73179270655663e-07, "loss": 1.9348, "num_input_tokens_seen": 81248760, "step": 79800 }, { "epoch": 1.573113346852789, "grad_norm": 2.1212682723999023, "learning_rate": 9.731126097158923e-07, "loss": 1.9619, "num_input_tokens_seen": 81351160, "step": 79900 }, { "epoch": 1.5750821996022917, "grad_norm": 1.9808043241500854, "learning_rate": 9.73045868327313e-07, "loss": 2.0226, "num_input_tokens_seen": 81452800, "step": 80000 }, { "epoch": 1.5770510523517947, "grad_norm": 2.1366357803344727, "learning_rate": 9.729790465012735e-07, "loss": 1.9838, "num_input_tokens_seen": 81554072, "step": 80100 }, { "epoch": 1.5790199051012976, "grad_norm": 1.896931767463684, "learning_rate": 9.729121442491366e-07, "loss": 1.9746, "num_input_tokens_seen": 81656472, "step": 80200 }, { "epoch": 1.5809887578508004, "grad_norm": 1.8604445457458496, "learning_rate": 9.728451615822782e-07, "loss": 1.9403, "num_input_tokens_seen": 81758424, "step": 80300 }, { "epoch": 1.5829576106003032, "grad_norm": 1.9297837018966675, "learning_rate": 9.727780985120882e-07, "loss": 1.9945, "num_input_tokens_seen": 81860824, "step": 80400 }, { "epoch": 1.584926463349806, "grad_norm": 2.1828677654266357, "learning_rate": 9.727109550499701e-07, "loss": 2.0118, "num_input_tokens_seen": 81962864, "step": 80500 }, { "epoch": 1.586895316099309, "grad_norm": 2.0296082496643066, "learning_rate": 9.726437312073413e-07, "loss": 2.0029, "num_input_tokens_seen": 82065264, "step": 80600 }, { "epoch": 1.5888641688488119, "grad_norm": 1.9530014991760254, "learning_rate": 9.725764269956322e-07, "loss": 2.0214, "num_input_tokens_seen": 82166888, "step": 80700 }, { "epoch": 1.5908330215983146, "grad_norm": 1.8273735046386719, "learning_rate": 9.725090424262877e-07, "loss": 2.0482, "num_input_tokens_seen": 82268104, "step": 80800 }, { "epoch": 1.5928018743478174, "grad_norm": 21.396066665649414, "learning_rate": 9.72441577510766e-07, "loss": 1.9227, "num_input_tokens_seen": 82370504, "step": 80900 }, { "epoch": 1.5947707270973204, "grad_norm": 1.795525074005127, "learning_rate": 9.723740322605387e-07, "loss": 1.9769, "num_input_tokens_seen": 82472904, "step": 81000 }, { "epoch": 1.5967395798468234, "grad_norm": 1.9250332117080688, "learning_rate": 9.723064066870915e-07, "loss": 1.9725, "num_input_tokens_seen": 82575304, "step": 81100 }, { "epoch": 1.5987084325963261, "grad_norm": 2.055537223815918, "learning_rate": 9.722387008019233e-07, "loss": 1.9381, "num_input_tokens_seen": 82677088, "step": 81200 }, { "epoch": 1.6006772853458289, "grad_norm": 2.0581607818603516, "learning_rate": 9.721709146165472e-07, "loss": 2.0336, "num_input_tokens_seen": 82778960, "step": 81300 }, { "epoch": 1.6026461380953319, "grad_norm": 1.6808435916900635, "learning_rate": 9.721030481424894e-07, "loss": 1.9552, "num_input_tokens_seen": 82881360, "step": 81400 }, { "epoch": 1.6046149908448348, "grad_norm": 1.754149317741394, "learning_rate": 9.720351013912904e-07, "loss": 1.9766, "num_input_tokens_seen": 82983016, "step": 81500 }, { "epoch": 1.6065838435943376, "grad_norm": 1.7373837232589722, "learning_rate": 9.719670743745035e-07, "loss": 2.0009, "num_input_tokens_seen": 83084648, "step": 81600 }, { "epoch": 1.6085526963438403, "grad_norm": 2.476461172103882, "learning_rate": 9.718989671036964e-07, "loss": 2.0358, "num_input_tokens_seen": 83184536, "step": 81700 }, { "epoch": 1.6105215490933433, "grad_norm": 2.0237913131713867, "learning_rate": 9.7183077959045e-07, "loss": 1.9746, "num_input_tokens_seen": 83285576, "step": 81800 }, { "epoch": 1.6124904018428463, "grad_norm": 1.7231385707855225, "learning_rate": 9.717625118463596e-07, "loss": 2.0006, "num_input_tokens_seen": 83387664, "step": 81900 }, { "epoch": 1.614459254592349, "grad_norm": 2.1230030059814453, "learning_rate": 9.716941638830327e-07, "loss": 1.9574, "num_input_tokens_seen": 83489336, "step": 82000 }, { "epoch": 1.6164281073418518, "grad_norm": 2.5109426975250244, "learning_rate": 9.716257357120917e-07, "loss": 1.985, "num_input_tokens_seen": 83589680, "step": 82100 }, { "epoch": 1.6183969600913548, "grad_norm": 2.1775271892547607, "learning_rate": 9.715572273451724e-07, "loss": 1.9733, "num_input_tokens_seen": 83692080, "step": 82200 }, { "epoch": 1.6203658128408578, "grad_norm": 1.7170239686965942, "learning_rate": 9.714886387939236e-07, "loss": 1.9695, "num_input_tokens_seen": 83793120, "step": 82300 }, { "epoch": 1.6223346655903605, "grad_norm": 1.8938959836959839, "learning_rate": 9.714199700700087e-07, "loss": 1.9293, "num_input_tokens_seen": 83894968, "step": 82400 }, { "epoch": 1.6243035183398633, "grad_norm": 1.83748459815979, "learning_rate": 9.713512211851041e-07, "loss": 1.9694, "num_input_tokens_seen": 83996896, "step": 82500 }, { "epoch": 1.626272371089366, "grad_norm": 3.0680124759674072, "learning_rate": 9.712823921508998e-07, "loss": 2.0562, "num_input_tokens_seen": 84096456, "step": 82600 }, { "epoch": 1.628241223838869, "grad_norm": 1.8784260749816895, "learning_rate": 9.712134829790997e-07, "loss": 1.9984, "num_input_tokens_seen": 84198856, "step": 82700 }, { "epoch": 1.630210076588372, "grad_norm": 2.3644661903381348, "learning_rate": 9.711444936814211e-07, "loss": 1.9305, "num_input_tokens_seen": 84300632, "step": 82800 }, { "epoch": 1.6321789293378748, "grad_norm": 1.8629422187805176, "learning_rate": 9.710754242695952e-07, "loss": 1.9551, "num_input_tokens_seen": 84402584, "step": 82900 }, { "epoch": 1.6341477820873775, "grad_norm": 1.981223464012146, "learning_rate": 9.710062747553668e-07, "loss": 1.9489, "num_input_tokens_seen": 84504984, "step": 83000 }, { "epoch": 1.6361166348368805, "grad_norm": 1.9939093589782715, "learning_rate": 9.70937045150494e-07, "loss": 1.9794, "num_input_tokens_seen": 84606920, "step": 83100 }, { "epoch": 1.6380854875863835, "grad_norm": 1.9648970365524292, "learning_rate": 9.708677354667487e-07, "loss": 2.0041, "num_input_tokens_seen": 84708232, "step": 83200 }, { "epoch": 1.6400543403358863, "grad_norm": 2.2339165210723877, "learning_rate": 9.707983457159164e-07, "loss": 2.0034, "num_input_tokens_seen": 84809944, "step": 83300 }, { "epoch": 1.642023193085389, "grad_norm": 13.141552925109863, "learning_rate": 9.707288759097963e-07, "loss": 2.0143, "num_input_tokens_seen": 84911480, "step": 83400 }, { "epoch": 1.643992045834892, "grad_norm": 1.8389010429382324, "learning_rate": 9.70659326060201e-07, "loss": 1.9945, "num_input_tokens_seen": 85013152, "step": 83500 }, { "epoch": 1.645960898584395, "grad_norm": 1.8468313217163086, "learning_rate": 9.705896961789575e-07, "loss": 1.9712, "num_input_tokens_seen": 85115552, "step": 83600 }, { "epoch": 1.6479297513338977, "grad_norm": 2.0421788692474365, "learning_rate": 9.70519986277905e-07, "loss": 2.0293, "num_input_tokens_seen": 85217256, "step": 83700 }, { "epoch": 1.6498986040834005, "grad_norm": 2.0900497436523438, "learning_rate": 9.704501963688973e-07, "loss": 1.978, "num_input_tokens_seen": 85319104, "step": 83800 }, { "epoch": 1.6518674568329035, "grad_norm": 1.9232817888259888, "learning_rate": 9.703803264638018e-07, "loss": 2.0701, "num_input_tokens_seen": 85421296, "step": 83900 }, { "epoch": 1.6538363095824065, "grad_norm": 1.8633781671524048, "learning_rate": 9.703103765744992e-07, "loss": 1.9357, "num_input_tokens_seen": 85523144, "step": 84000 }, { "epoch": 1.6558051623319092, "grad_norm": 1.8194736242294312, "learning_rate": 9.702403467128838e-07, "loss": 2.0163, "num_input_tokens_seen": 85623960, "step": 84100 }, { "epoch": 1.657774015081412, "grad_norm": 1.9421623945236206, "learning_rate": 9.701702368908636e-07, "loss": 2.0152, "num_input_tokens_seen": 85725696, "step": 84200 }, { "epoch": 1.659742867830915, "grad_norm": 1.9922300577163696, "learning_rate": 9.7010004712036e-07, "loss": 1.9914, "num_input_tokens_seen": 85828096, "step": 84300 }, { "epoch": 1.661711720580418, "grad_norm": 2.143540143966675, "learning_rate": 9.700297774133088e-07, "loss": 2.0373, "num_input_tokens_seen": 85928912, "step": 84400 }, { "epoch": 1.6636805733299207, "grad_norm": 1.841282606124878, "learning_rate": 9.699594277816582e-07, "loss": 2.0367, "num_input_tokens_seen": 86031312, "step": 84500 }, { "epoch": 1.6656494260794235, "grad_norm": 1.9451500177383423, "learning_rate": 9.698889982373707e-07, "loss": 1.9781, "num_input_tokens_seen": 86133192, "step": 84600 }, { "epoch": 1.6676182788289264, "grad_norm": 1.8159606456756592, "learning_rate": 9.698184887924224e-07, "loss": 2.015, "num_input_tokens_seen": 86233248, "step": 84700 }, { "epoch": 1.6695871315784292, "grad_norm": 2.2536284923553467, "learning_rate": 9.697478994588026e-07, "loss": 2.0173, "num_input_tokens_seen": 86335648, "step": 84800 }, { "epoch": 1.6715559843279322, "grad_norm": 2.063523530960083, "learning_rate": 9.696772302485146e-07, "loss": 1.9713, "num_input_tokens_seen": 86438048, "step": 84900 }, { "epoch": 1.673524837077435, "grad_norm": 1.7738423347473145, "learning_rate": 9.69606481173575e-07, "loss": 1.9183, "num_input_tokens_seen": 86539944, "step": 85000 }, { "epoch": 1.6754936898269377, "grad_norm": 2.0155112743377686, "learning_rate": 9.69535652246014e-07, "loss": 1.9736, "num_input_tokens_seen": 86642344, "step": 85100 }, { "epoch": 1.6774625425764407, "grad_norm": 2.012538433074951, "learning_rate": 9.694647434778758e-07, "loss": 1.9544, "num_input_tokens_seen": 86744064, "step": 85200 }, { "epoch": 1.6794313953259437, "grad_norm": 2.0843119621276855, "learning_rate": 9.693937548812175e-07, "loss": 1.9852, "num_input_tokens_seen": 86846464, "step": 85300 }, { "epoch": 1.6814002480754464, "grad_norm": 2.1301987171173096, "learning_rate": 9.693226864681101e-07, "loss": 1.9742, "num_input_tokens_seen": 86948360, "step": 85400 }, { "epoch": 1.6833691008249492, "grad_norm": 2.021069288253784, "learning_rate": 9.692515382506385e-07, "loss": 1.9824, "num_input_tokens_seen": 87050168, "step": 85500 }, { "epoch": 1.6853379535744522, "grad_norm": 2.3929224014282227, "learning_rate": 9.691803102409007e-07, "loss": 1.9663, "num_input_tokens_seen": 87152568, "step": 85600 }, { "epoch": 1.6873068063239551, "grad_norm": 1.986000418663025, "learning_rate": 9.691090024510082e-07, "loss": 2.0377, "num_input_tokens_seen": 87253352, "step": 85700 }, { "epoch": 1.689275659073458, "grad_norm": 1.9733489751815796, "learning_rate": 9.690376148930866e-07, "loss": 1.9643, "num_input_tokens_seen": 87354512, "step": 85800 }, { "epoch": 1.6912445118229606, "grad_norm": 21.12444305419922, "learning_rate": 9.689661475792747e-07, "loss": 1.9621, "num_input_tokens_seen": 87455960, "step": 85900 }, { "epoch": 1.6932133645724636, "grad_norm": 2.2019286155700684, "learning_rate": 9.688946005217247e-07, "loss": 2.0407, "num_input_tokens_seen": 87556448, "step": 86000 }, { "epoch": 1.6951822173219666, "grad_norm": 2.0862855911254883, "learning_rate": 9.688229737326028e-07, "loss": 2.0029, "num_input_tokens_seen": 87658168, "step": 86100 }, { "epoch": 1.6971510700714694, "grad_norm": 1.8821500539779663, "learning_rate": 9.687512672240886e-07, "loss": 2.0032, "num_input_tokens_seen": 87759112, "step": 86200 }, { "epoch": 1.6991199228209721, "grad_norm": 1.727913737297058, "learning_rate": 9.686794810083748e-07, "loss": 1.9832, "num_input_tokens_seen": 87861512, "step": 86300 }, { "epoch": 1.701088775570475, "grad_norm": 5.58336067199707, "learning_rate": 9.686076150976683e-07, "loss": 1.9768, "num_input_tokens_seen": 87961408, "step": 86400 }, { "epoch": 1.703057628319978, "grad_norm": 4.310601234436035, "learning_rate": 9.685356695041894e-07, "loss": 1.9896, "num_input_tokens_seen": 88062232, "step": 86500 }, { "epoch": 1.7050264810694808, "grad_norm": 1.8383711576461792, "learning_rate": 9.684636442401717e-07, "loss": 2.0618, "num_input_tokens_seen": 88162504, "step": 86600 }, { "epoch": 1.7069953338189836, "grad_norm": 1.9602752923965454, "learning_rate": 9.683915393178623e-07, "loss": 2.0026, "num_input_tokens_seen": 88264904, "step": 86700 }, { "epoch": 1.7089641865684866, "grad_norm": 1.879712462425232, "learning_rate": 9.683193547495225e-07, "loss": 1.9409, "num_input_tokens_seen": 88367304, "step": 86800 }, { "epoch": 1.7109330393179896, "grad_norm": 2.159916400909424, "learning_rate": 9.682470905474262e-07, "loss": 1.9363, "num_input_tokens_seen": 88469704, "step": 86900 }, { "epoch": 1.7129018920674923, "grad_norm": 2.1803338527679443, "learning_rate": 9.681747467238616e-07, "loss": 2.0092, "num_input_tokens_seen": 88570816, "step": 87000 }, { "epoch": 1.714870744816995, "grad_norm": 2.102480411529541, "learning_rate": 9.681023232911303e-07, "loss": 2.0043, "num_input_tokens_seen": 88673216, "step": 87100 }, { "epoch": 1.7168395975664978, "grad_norm": 2.1512107849121094, "learning_rate": 9.680298202615469e-07, "loss": 1.9723, "num_input_tokens_seen": 88775616, "step": 87200 }, { "epoch": 1.7188084503160008, "grad_norm": 1.8901636600494385, "learning_rate": 9.6795723764744e-07, "loss": 2.0005, "num_input_tokens_seen": 88878016, "step": 87300 }, { "epoch": 1.7207773030655038, "grad_norm": 1.9403070211410522, "learning_rate": 9.67884575461152e-07, "loss": 1.9812, "num_input_tokens_seen": 88979880, "step": 87400 }, { "epoch": 1.7227461558150066, "grad_norm": 2.147320032119751, "learning_rate": 9.67811833715038e-07, "loss": 1.9936, "num_input_tokens_seen": 89080840, "step": 87500 }, { "epoch": 1.7247150085645093, "grad_norm": 1.8907344341278076, "learning_rate": 9.677390124214676e-07, "loss": 1.9728, "num_input_tokens_seen": 89181928, "step": 87600 }, { "epoch": 1.7266838613140123, "grad_norm": 3.7446982860565186, "learning_rate": 9.676661115928232e-07, "loss": 2.0041, "num_input_tokens_seen": 89284328, "step": 87700 }, { "epoch": 1.7286527140635153, "grad_norm": 1.849755048751831, "learning_rate": 9.67593131241501e-07, "loss": 2.0008, "num_input_tokens_seen": 89386432, "step": 87800 }, { "epoch": 1.730621566813018, "grad_norm": 1.9824796915054321, "learning_rate": 9.675200713799105e-07, "loss": 1.9439, "num_input_tokens_seen": 89488832, "step": 87900 }, { "epoch": 1.7325904195625208, "grad_norm": 2.4449574947357178, "learning_rate": 9.674469320204754e-07, "loss": 1.9409, "num_input_tokens_seen": 89590416, "step": 88000 }, { "epoch": 1.7345592723120238, "grad_norm": 1.7975555658340454, "learning_rate": 9.67373713175632e-07, "loss": 1.9803, "num_input_tokens_seen": 89691896, "step": 88100 }, { "epoch": 1.7365281250615268, "grad_norm": 4.588106632232666, "learning_rate": 9.673004148578308e-07, "loss": 1.9907, "num_input_tokens_seen": 89792872, "step": 88200 }, { "epoch": 1.7384969778110295, "grad_norm": 2.0367724895477295, "learning_rate": 9.672270370795353e-07, "loss": 1.9816, "num_input_tokens_seen": 89894496, "step": 88300 }, { "epoch": 1.7404658305605323, "grad_norm": 1.5424094200134277, "learning_rate": 9.67153579853223e-07, "loss": 2.0191, "num_input_tokens_seen": 89996896, "step": 88400 }, { "epoch": 1.7424346833100353, "grad_norm": 2.260645627975464, "learning_rate": 9.670800431913843e-07, "loss": 1.9791, "num_input_tokens_seen": 90097968, "step": 88500 }, { "epoch": 1.7444035360595382, "grad_norm": 1.9642016887664795, "learning_rate": 9.67006427106524e-07, "loss": 1.9712, "num_input_tokens_seen": 90199632, "step": 88600 }, { "epoch": 1.746372388809041, "grad_norm": 2.2560601234436035, "learning_rate": 9.6693273161116e-07, "loss": 2.022, "num_input_tokens_seen": 90302032, "step": 88700 }, { "epoch": 1.7483412415585438, "grad_norm": 1.8089711666107178, "learning_rate": 9.66858956717823e-07, "loss": 1.9656, "num_input_tokens_seen": 90404432, "step": 88800 }, { "epoch": 1.7503100943080467, "grad_norm": 1.9336212873458862, "learning_rate": 9.66785102439058e-07, "loss": 1.9988, "num_input_tokens_seen": 90506832, "step": 88900 }, { "epoch": 1.7522789470575497, "grad_norm": 2.186123847961426, "learning_rate": 9.667111687874235e-07, "loss": 2.0029, "num_input_tokens_seen": 90608080, "step": 89000 }, { "epoch": 1.7542477998070525, "grad_norm": 2.1508913040161133, "learning_rate": 9.66637155775491e-07, "loss": 1.9678, "num_input_tokens_seen": 90710480, "step": 89100 }, { "epoch": 1.7562166525565552, "grad_norm": 2.0526251792907715, "learning_rate": 9.665630634158463e-07, "loss": 1.9458, "num_input_tokens_seen": 90812880, "step": 89200 }, { "epoch": 1.758185505306058, "grad_norm": 1.6457970142364502, "learning_rate": 9.664888917210876e-07, "loss": 2.0069, "num_input_tokens_seen": 90914512, "step": 89300 }, { "epoch": 1.760154358055561, "grad_norm": 2.02527117729187, "learning_rate": 9.664146407038276e-07, "loss": 2.0099, "num_input_tokens_seen": 91016912, "step": 89400 }, { "epoch": 1.762123210805064, "grad_norm": 1.8556376695632935, "learning_rate": 9.663403103766915e-07, "loss": 1.9936, "num_input_tokens_seen": 91118384, "step": 89500 }, { "epoch": 1.7640920635545667, "grad_norm": 2.298250436782837, "learning_rate": 9.662659007523194e-07, "loss": 1.9865, "num_input_tokens_seen": 91219864, "step": 89600 }, { "epoch": 1.7660609163040695, "grad_norm": 1.9188224077224731, "learning_rate": 9.661914118433632e-07, "loss": 1.9887, "num_input_tokens_seen": 91322264, "step": 89700 }, { "epoch": 1.7680297690535725, "grad_norm": 1.9386297464370728, "learning_rate": 9.661168436624896e-07, "loss": 1.9671, "num_input_tokens_seen": 91423704, "step": 89800 }, { "epoch": 1.7699986218030754, "grad_norm": 2.0172412395477295, "learning_rate": 9.66042196222378e-07, "loss": 1.9824, "num_input_tokens_seen": 91525368, "step": 89900 }, { "epoch": 1.7719674745525782, "grad_norm": 2.1074607372283936, "learning_rate": 9.659674695357219e-07, "loss": 1.9883, "num_input_tokens_seen": 91626848, "step": 90000 }, { "epoch": 1.773936327302081, "grad_norm": 1.7277181148529053, "learning_rate": 9.658926636152277e-07, "loss": 1.984, "num_input_tokens_seen": 91729248, "step": 90100 }, { "epoch": 1.775905180051584, "grad_norm": 2.026073932647705, "learning_rate": 9.658177784736155e-07, "loss": 1.9806, "num_input_tokens_seen": 91830856, "step": 90200 }, { "epoch": 1.777874032801087, "grad_norm": 2.0041587352752686, "learning_rate": 9.65742814123619e-07, "loss": 1.9766, "num_input_tokens_seen": 91932720, "step": 90300 }, { "epoch": 1.7798428855505897, "grad_norm": 1.86166250705719, "learning_rate": 9.656677705779851e-07, "loss": 1.9995, "num_input_tokens_seen": 92035120, "step": 90400 }, { "epoch": 1.7818117383000924, "grad_norm": 1.8686820268630981, "learning_rate": 9.655926478494745e-07, "loss": 1.9849, "num_input_tokens_seen": 92136656, "step": 90500 }, { "epoch": 1.7837805910495954, "grad_norm": 2.093017101287842, "learning_rate": 9.655174459508612e-07, "loss": 1.97, "num_input_tokens_seen": 92239056, "step": 90600 }, { "epoch": 1.7857494437990984, "grad_norm": 2.0866103172302246, "learning_rate": 9.654421648949326e-07, "loss": 2.0135, "num_input_tokens_seen": 92340088, "step": 90700 }, { "epoch": 1.7877182965486011, "grad_norm": 1.6865144968032837, "learning_rate": 9.653668046944895e-07, "loss": 1.9846, "num_input_tokens_seen": 92442488, "step": 90800 }, { "epoch": 1.789687149298104, "grad_norm": 1.776648759841919, "learning_rate": 9.652913653623463e-07, "loss": 2.0015, "num_input_tokens_seen": 92544456, "step": 90900 }, { "epoch": 1.7916560020476069, "grad_norm": 1.8553825616836548, "learning_rate": 9.65215846911331e-07, "loss": 1.9725, "num_input_tokens_seen": 92645928, "step": 91000 }, { "epoch": 1.7936248547971099, "grad_norm": 1.9707273244857788, "learning_rate": 9.651402493542845e-07, "loss": 1.9466, "num_input_tokens_seen": 92748328, "step": 91100 }, { "epoch": 1.7955937075466126, "grad_norm": 1.7398895025253296, "learning_rate": 9.650645727040622e-07, "loss": 1.9408, "num_input_tokens_seen": 92849856, "step": 91200 }, { "epoch": 1.7975625602961154, "grad_norm": 1.9941409826278687, "learning_rate": 9.649888169735316e-07, "loss": 2.0098, "num_input_tokens_seen": 92951760, "step": 91300 }, { "epoch": 1.7995314130456184, "grad_norm": 2.0445821285247803, "learning_rate": 9.649129821755744e-07, "loss": 1.9681, "num_input_tokens_seen": 93053248, "step": 91400 }, { "epoch": 1.8015002657951213, "grad_norm": 1.999802589416504, "learning_rate": 9.648370683230862e-07, "loss": 2.0062, "num_input_tokens_seen": 93155264, "step": 91500 }, { "epoch": 1.803469118544624, "grad_norm": 1.9985079765319824, "learning_rate": 9.64761075428975e-07, "loss": 1.9573, "num_input_tokens_seen": 93257664, "step": 91600 }, { "epoch": 1.8054379712941269, "grad_norm": 1.9642248153686523, "learning_rate": 9.646850035061627e-07, "loss": 1.9518, "num_input_tokens_seen": 93359472, "step": 91700 }, { "epoch": 1.8074068240436296, "grad_norm": 2.118866205215454, "learning_rate": 9.64608852567585e-07, "loss": 1.953, "num_input_tokens_seen": 93461872, "step": 91800 }, { "epoch": 1.8093756767931326, "grad_norm": 2.0875632762908936, "learning_rate": 9.645326226261907e-07, "loss": 2.0197, "num_input_tokens_seen": 93564272, "step": 91900 }, { "epoch": 1.8113445295426356, "grad_norm": 2.062621593475342, "learning_rate": 9.64456313694942e-07, "loss": 1.9998, "num_input_tokens_seen": 93666152, "step": 92000 }, { "epoch": 1.8133133822921383, "grad_norm": 1.7994866371154785, "learning_rate": 9.643799257868144e-07, "loss": 1.9895, "num_input_tokens_seen": 93768552, "step": 92100 }, { "epoch": 1.815282235041641, "grad_norm": 1.7963272333145142, "learning_rate": 9.643034589147974e-07, "loss": 1.9744, "num_input_tokens_seen": 93870192, "step": 92200 }, { "epoch": 1.817251087791144, "grad_norm": 2.3792521953582764, "learning_rate": 9.642269130918933e-07, "loss": 1.9813, "num_input_tokens_seen": 93971720, "step": 92300 }, { "epoch": 1.819219940540647, "grad_norm": 1.9264003038406372, "learning_rate": 9.64150288331118e-07, "loss": 1.9521, "num_input_tokens_seen": 94073592, "step": 92400 }, { "epoch": 1.8211887932901498, "grad_norm": 2.1435394287109375, "learning_rate": 9.640735846455008e-07, "loss": 1.9639, "num_input_tokens_seen": 94174896, "step": 92500 }, { "epoch": 1.8231576460396526, "grad_norm": 1.9926447868347168, "learning_rate": 9.639968020480848e-07, "loss": 2.0172, "num_input_tokens_seen": 94276528, "step": 92600 }, { "epoch": 1.8251264987891556, "grad_norm": 1.8766510486602783, "learning_rate": 9.639199405519264e-07, "loss": 1.9633, "num_input_tokens_seen": 94378928, "step": 92700 }, { "epoch": 1.8270953515386585, "grad_norm": 1.9852813482284546, "learning_rate": 9.638430001700947e-07, "loss": 1.9817, "num_input_tokens_seen": 94480736, "step": 92800 }, { "epoch": 1.8290642042881613, "grad_norm": 1.9052667617797852, "learning_rate": 9.637659809156734e-07, "loss": 1.9597, "num_input_tokens_seen": 94583136, "step": 92900 }, { "epoch": 1.831033057037664, "grad_norm": 1.8089063167572021, "learning_rate": 9.636888828017583e-07, "loss": 1.977, "num_input_tokens_seen": 94684968, "step": 93000 }, { "epoch": 1.833001909787167, "grad_norm": 2.035827159881592, "learning_rate": 9.636117058414597e-07, "loss": 2.0445, "num_input_tokens_seen": 94787368, "step": 93100 }, { "epoch": 1.83497076253667, "grad_norm": 2.105203151702881, "learning_rate": 9.635344500479008e-07, "loss": 2.0241, "num_input_tokens_seen": 94889768, "step": 93200 }, { "epoch": 1.8369396152861728, "grad_norm": 1.791733741760254, "learning_rate": 9.634571154342185e-07, "loss": 1.9712, "num_input_tokens_seen": 94991496, "step": 93300 }, { "epoch": 1.8389084680356755, "grad_norm": 1.906721830368042, "learning_rate": 9.633797020135623e-07, "loss": 1.9808, "num_input_tokens_seen": 95093896, "step": 93400 }, { "epoch": 1.8408773207851785, "grad_norm": 1.7625744342803955, "learning_rate": 9.633022097990964e-07, "loss": 2.0874, "num_input_tokens_seen": 95196296, "step": 93500 }, { "epoch": 1.8428461735346815, "grad_norm": 1.900176763534546, "learning_rate": 9.632246388039973e-07, "loss": 1.9437, "num_input_tokens_seen": 95298072, "step": 93600 }, { "epoch": 1.8448150262841843, "grad_norm": 2.072072982788086, "learning_rate": 9.631469890414555e-07, "loss": 1.9883, "num_input_tokens_seen": 95399152, "step": 93700 }, { "epoch": 1.846783879033687, "grad_norm": 1.7902374267578125, "learning_rate": 9.630692605246745e-07, "loss": 2.0166, "num_input_tokens_seen": 95501552, "step": 93800 }, { "epoch": 1.8487527317831898, "grad_norm": 1.8923031091690063, "learning_rate": 9.629914532668714e-07, "loss": 2.0062, "num_input_tokens_seen": 95602872, "step": 93900 }, { "epoch": 1.8507215845326928, "grad_norm": 1.8135712146759033, "learning_rate": 9.629135672812767e-07, "loss": 1.9453, "num_input_tokens_seen": 95704456, "step": 94000 }, { "epoch": 1.8526904372821957, "grad_norm": 2.003229856491089, "learning_rate": 9.628356025811344e-07, "loss": 1.9747, "num_input_tokens_seen": 95806856, "step": 94100 }, { "epoch": 1.8546592900316985, "grad_norm": 1.997440218925476, "learning_rate": 9.627575591797014e-07, "loss": 1.9613, "num_input_tokens_seen": 95909256, "step": 94200 }, { "epoch": 1.8566281427812013, "grad_norm": 2.084867477416992, "learning_rate": 9.626794370902488e-07, "loss": 1.9573, "num_input_tokens_seen": 96011280, "step": 94300 }, { "epoch": 1.8585969955307042, "grad_norm": 2.002774238586426, "learning_rate": 9.626012363260602e-07, "loss": 2.0107, "num_input_tokens_seen": 96112632, "step": 94400 }, { "epoch": 1.8605658482802072, "grad_norm": 1.995097279548645, "learning_rate": 9.625229569004332e-07, "loss": 2.0194, "num_input_tokens_seen": 96215032, "step": 94500 }, { "epoch": 1.86253470102971, "grad_norm": 2.048365592956543, "learning_rate": 9.624445988266784e-07, "loss": 1.9917, "num_input_tokens_seen": 96317432, "step": 94600 }, { "epoch": 1.8645035537792127, "grad_norm": 1.882153034210205, "learning_rate": 9.6236616211812e-07, "loss": 1.962, "num_input_tokens_seen": 96417968, "step": 94700 }, { "epoch": 1.8664724065287157, "grad_norm": 2.021857261657715, "learning_rate": 9.622876467880955e-07, "loss": 1.9406, "num_input_tokens_seen": 96519552, "step": 94800 }, { "epoch": 1.8684412592782187, "grad_norm": 1.8111686706542969, "learning_rate": 9.62209052849956e-07, "loss": 1.9959, "num_input_tokens_seen": 96621520, "step": 94900 }, { "epoch": 1.8704101120277214, "grad_norm": 1.9492944478988647, "learning_rate": 9.621303803170653e-07, "loss": 1.9945, "num_input_tokens_seen": 96722568, "step": 95000 }, { "epoch": 1.8723789647772242, "grad_norm": 2.197702407836914, "learning_rate": 9.620516292028013e-07, "loss": 2.0119, "num_input_tokens_seen": 96824128, "step": 95100 }, { "epoch": 1.8743478175267272, "grad_norm": 6.745215892791748, "learning_rate": 9.619727995205549e-07, "loss": 1.9748, "num_input_tokens_seen": 96925128, "step": 95200 }, { "epoch": 1.8763166702762302, "grad_norm": 1.797088861465454, "learning_rate": 9.618938912837302e-07, "loss": 1.9338, "num_input_tokens_seen": 97027528, "step": 95300 }, { "epoch": 1.878285523025733, "grad_norm": 2.1103646755218506, "learning_rate": 9.618149045057453e-07, "loss": 1.9623, "num_input_tokens_seen": 97129928, "step": 95400 }, { "epoch": 1.8802543757752357, "grad_norm": 1.8976510763168335, "learning_rate": 9.617358392000308e-07, "loss": 1.9466, "num_input_tokens_seen": 97232328, "step": 95500 }, { "epoch": 1.8822232285247387, "grad_norm": 1.9777162075042725, "learning_rate": 9.616566953800315e-07, "loss": 1.9678, "num_input_tokens_seen": 97334128, "step": 95600 }, { "epoch": 1.8841920812742416, "grad_norm": 0.0, "learning_rate": 9.615774730592048e-07, "loss": 1.9838, "num_input_tokens_seen": 97436528, "step": 95700 }, { "epoch": 1.8861609340237444, "grad_norm": 2.0475876331329346, "learning_rate": 9.61498172251022e-07, "loss": 1.9909, "num_input_tokens_seen": 97537648, "step": 95800 }, { "epoch": 1.8881297867732472, "grad_norm": 1.9707614183425903, "learning_rate": 9.614187929689676e-07, "loss": 1.9675, "num_input_tokens_seen": 97640048, "step": 95900 }, { "epoch": 1.8900986395227501, "grad_norm": 2.077984571456909, "learning_rate": 9.61339335226539e-07, "loss": 1.9872, "num_input_tokens_seen": 97742448, "step": 96000 }, { "epoch": 1.892067492272253, "grad_norm": 1.7685893774032593, "learning_rate": 9.612597990372478e-07, "loss": 1.9564, "num_input_tokens_seen": 97844272, "step": 96100 }, { "epoch": 1.8940363450217559, "grad_norm": 1.7292839288711548, "learning_rate": 9.61180184414618e-07, "loss": 1.9709, "num_input_tokens_seen": 97946672, "step": 96200 }, { "epoch": 1.8960051977712586, "grad_norm": 1.8539750576019287, "learning_rate": 9.611004913721875e-07, "loss": 2.0124, "num_input_tokens_seen": 98048792, "step": 96300 }, { "epoch": 1.8979740505207614, "grad_norm": 2.281482458114624, "learning_rate": 9.61020719923508e-07, "loss": 1.9263, "num_input_tokens_seen": 98151192, "step": 96400 }, { "epoch": 1.8999429032702644, "grad_norm": 2.0945627689361572, "learning_rate": 9.609408700821431e-07, "loss": 1.9979, "num_input_tokens_seen": 98253592, "step": 96500 }, { "epoch": 1.9019117560197674, "grad_norm": 2.0845375061035156, "learning_rate": 9.608609418616712e-07, "loss": 2.0247, "num_input_tokens_seen": 98355992, "step": 96600 }, { "epoch": 1.9038806087692701, "grad_norm": 1.7269103527069092, "learning_rate": 9.607809352756833e-07, "loss": 1.9279, "num_input_tokens_seen": 98457472, "step": 96700 }, { "epoch": 1.9058494615187729, "grad_norm": 1.9988739490509033, "learning_rate": 9.607008503377835e-07, "loss": 2.002, "num_input_tokens_seen": 98559072, "step": 96800 }, { "epoch": 1.9078183142682759, "grad_norm": 1.806113839149475, "learning_rate": 9.6062068706159e-07, "loss": 1.9937, "num_input_tokens_seen": 98661472, "step": 96900 }, { "epoch": 1.9097871670177788, "grad_norm": 2.1428043842315674, "learning_rate": 9.605404454607336e-07, "loss": 1.9797, "num_input_tokens_seen": 98763872, "step": 97000 }, { "epoch": 1.9117560197672816, "grad_norm": 1.758191466331482, "learning_rate": 9.60460125548859e-07, "loss": 1.9668, "num_input_tokens_seen": 98866272, "step": 97100 }, { "epoch": 1.9137248725167844, "grad_norm": 2.152453899383545, "learning_rate": 9.603797273396238e-07, "loss": 1.9813, "num_input_tokens_seen": 98967592, "step": 97200 }, { "epoch": 1.9156937252662873, "grad_norm": 1.9883763790130615, "learning_rate": 9.602992508466988e-07, "loss": 2.0022, "num_input_tokens_seen": 99069328, "step": 97300 }, { "epoch": 1.9176625780157903, "grad_norm": 2.1257636547088623, "learning_rate": 9.602186960837686e-07, "loss": 1.9448, "num_input_tokens_seen": 99171296, "step": 97400 }, { "epoch": 1.919631430765293, "grad_norm": 2.0925965309143066, "learning_rate": 9.601380630645306e-07, "loss": 1.9926, "num_input_tokens_seen": 99272304, "step": 97500 }, { "epoch": 1.9216002835147958, "grad_norm": 1.9280892610549927, "learning_rate": 9.600573518026962e-07, "loss": 1.9754, "num_input_tokens_seen": 99374704, "step": 97600 }, { "epoch": 1.9235691362642988, "grad_norm": 3.2464559078216553, "learning_rate": 9.599765623119894e-07, "loss": 1.9787, "num_input_tokens_seen": 99476232, "step": 97700 }, { "epoch": 1.9255379890138018, "grad_norm": 2.031052827835083, "learning_rate": 9.598956946061477e-07, "loss": 1.997, "num_input_tokens_seen": 99578632, "step": 97800 }, { "epoch": 1.9275068417633046, "grad_norm": 2.3694489002227783, "learning_rate": 9.59814748698922e-07, "loss": 2.0419, "num_input_tokens_seen": 99680224, "step": 97900 }, { "epoch": 1.9294756945128073, "grad_norm": 1.87174391746521, "learning_rate": 9.597337246040768e-07, "loss": 1.9977, "num_input_tokens_seen": 99781520, "step": 98000 }, { "epoch": 1.9314445472623103, "grad_norm": 2.0461645126342773, "learning_rate": 9.59652622335389e-07, "loss": 1.9952, "num_input_tokens_seen": 99883256, "step": 98100 }, { "epoch": 1.9334134000118133, "grad_norm": 1.7510236501693726, "learning_rate": 9.5957144190665e-07, "loss": 1.9985, "num_input_tokens_seen": 99985656, "step": 98200 }, { "epoch": 1.935382252761316, "grad_norm": 1.9866126775741577, "learning_rate": 9.594901833316633e-07, "loss": 1.9399, "num_input_tokens_seen": 100087608, "step": 98300 }, { "epoch": 1.9373511055108188, "grad_norm": 1.7863428592681885, "learning_rate": 9.594088466242465e-07, "loss": 1.9726, "num_input_tokens_seen": 100190008, "step": 98400 }, { "epoch": 1.9393199582603216, "grad_norm": 1.5792447328567505, "learning_rate": 9.593274317982302e-07, "loss": 2.0431, "num_input_tokens_seen": 100291576, "step": 98500 }, { "epoch": 1.9412888110098245, "grad_norm": 26.58302116394043, "learning_rate": 9.592459388674584e-07, "loss": 2.0123, "num_input_tokens_seen": 100393976, "step": 98600 }, { "epoch": 1.9432576637593275, "grad_norm": 1.784316897392273, "learning_rate": 9.59164367845788e-07, "loss": 1.9633, "num_input_tokens_seen": 100496376, "step": 98700 }, { "epoch": 1.9452265165088303, "grad_norm": 1.9430768489837646, "learning_rate": 9.590827187470895e-07, "loss": 2.0061, "num_input_tokens_seen": 100598016, "step": 98800 }, { "epoch": 1.947195369258333, "grad_norm": 1.8332784175872803, "learning_rate": 9.590009915852468e-07, "loss": 1.9904, "num_input_tokens_seen": 100700416, "step": 98900 }, { "epoch": 1.949164222007836, "grad_norm": 1.8319400548934937, "learning_rate": 9.589191863741569e-07, "loss": 1.9649, "num_input_tokens_seen": 100802816, "step": 99000 }, { "epoch": 1.951133074757339, "grad_norm": 2.0436296463012695, "learning_rate": 9.588373031277302e-07, "loss": 2.0067, "num_input_tokens_seen": 100904712, "step": 99100 }, { "epoch": 1.9531019275068418, "grad_norm": 1.6619454622268677, "learning_rate": 9.5875534185989e-07, "loss": 2.052, "num_input_tokens_seen": 101006576, "step": 99200 }, { "epoch": 1.9550707802563445, "grad_norm": 1.8762683868408203, "learning_rate": 9.586733025845732e-07, "loss": 1.9737, "num_input_tokens_seen": 101108408, "step": 99300 }, { "epoch": 1.9570396330058475, "grad_norm": 2.014247179031372, "learning_rate": 9.5859118531573e-07, "loss": 1.984, "num_input_tokens_seen": 101210200, "step": 99400 }, { "epoch": 1.9590084857553505, "grad_norm": 1.9944854974746704, "learning_rate": 9.585089900673235e-07, "loss": 2.0265, "num_input_tokens_seen": 101311160, "step": 99500 }, { "epoch": 1.9609773385048532, "grad_norm": 1.8577845096588135, "learning_rate": 9.584267168533306e-07, "loss": 1.9765, "num_input_tokens_seen": 101412824, "step": 99600 }, { "epoch": 1.962946191254356, "grad_norm": 1.9040420055389404, "learning_rate": 9.583443656877413e-07, "loss": 1.9512, "num_input_tokens_seen": 101515224, "step": 99700 }, { "epoch": 1.964915044003859, "grad_norm": 2.1334547996520996, "learning_rate": 9.582619365845582e-07, "loss": 2.0074, "num_input_tokens_seen": 101617624, "step": 99800 }, { "epoch": 1.966883896753362, "grad_norm": 1.7800005674362183, "learning_rate": 9.581794295577979e-07, "loss": 1.9828, "num_input_tokens_seen": 101720024, "step": 99900 }, { "epoch": 1.9688527495028647, "grad_norm": 1.7841966152191162, "learning_rate": 9.580968446214903e-07, "loss": 1.9944, "num_input_tokens_seen": 101821616, "step": 100000 }, { "epoch": 1.9708216022523675, "grad_norm": 1.8359742164611816, "learning_rate": 9.580141817896779e-07, "loss": 1.9757, "num_input_tokens_seen": 101923464, "step": 100100 }, { "epoch": 1.9727904550018704, "grad_norm": 2.160520553588867, "learning_rate": 9.579314410764168e-07, "loss": 2.0337, "num_input_tokens_seen": 102025192, "step": 100200 }, { "epoch": 1.9747593077513734, "grad_norm": 1.8562108278274536, "learning_rate": 9.578486224957768e-07, "loss": 2.0133, "num_input_tokens_seen": 102127592, "step": 100300 }, { "epoch": 1.9767281605008762, "grad_norm": 2.1131303310394287, "learning_rate": 9.577657260618401e-07, "loss": 1.9646, "num_input_tokens_seen": 102229992, "step": 100400 }, { "epoch": 1.978697013250379, "grad_norm": 1.9324053525924683, "learning_rate": 9.576827517887024e-07, "loss": 2.0036, "num_input_tokens_seen": 102331856, "step": 100500 }, { "epoch": 1.9806658659998817, "grad_norm": 3.1988070011138916, "learning_rate": 9.575996996904733e-07, "loss": 1.996, "num_input_tokens_seen": 102434256, "step": 100600 }, { "epoch": 1.9826347187493847, "grad_norm": 2.1580376625061035, "learning_rate": 9.57516569781275e-07, "loss": 1.9995, "num_input_tokens_seen": 102536296, "step": 100700 }, { "epoch": 1.9846035714988877, "grad_norm": 1.9465465545654297, "learning_rate": 9.574333620752426e-07, "loss": 2.0036, "num_input_tokens_seen": 102638080, "step": 100800 }, { "epoch": 1.9865724242483904, "grad_norm": 2.174964189529419, "learning_rate": 9.573500765865253e-07, "loss": 1.939, "num_input_tokens_seen": 102740480, "step": 100900 }, { "epoch": 1.9885412769978932, "grad_norm": 2.338625431060791, "learning_rate": 9.572667133292849e-07, "loss": 1.9403, "num_input_tokens_seen": 102842336, "step": 101000 }, { "epoch": 1.9905101297473962, "grad_norm": 1.8449496030807495, "learning_rate": 9.571832723176967e-07, "loss": 2.0347, "num_input_tokens_seen": 102942616, "step": 101100 }, { "epoch": 1.9924789824968991, "grad_norm": 1.9634342193603516, "learning_rate": 9.57099753565949e-07, "loss": 1.9701, "num_input_tokens_seen": 103044392, "step": 101200 }, { "epoch": 1.994447835246402, "grad_norm": 1.7099233865737915, "learning_rate": 9.570161570882436e-07, "loss": 1.9833, "num_input_tokens_seen": 103146792, "step": 101300 }, { "epoch": 1.9964166879959047, "grad_norm": 2.008478879928589, "learning_rate": 9.569324828987955e-07, "loss": 1.9858, "num_input_tokens_seen": 103247832, "step": 101400 }, { "epoch": 1.9983855407454076, "grad_norm": 1.8442299365997314, "learning_rate": 9.568487310118324e-07, "loss": 1.9665, "num_input_tokens_seen": 103348552, "step": 101500 }, { "epoch": 2.0003543934949106, "grad_norm": 1.7430541515350342, "learning_rate": 9.56764901441596e-07, "loss": 1.9641, "num_input_tokens_seen": 103450656, "step": 101600 }, { "epoch": 2.0023232462444134, "grad_norm": 2.0603220462799072, "learning_rate": 9.566809942023405e-07, "loss": 1.9251, "num_input_tokens_seen": 103553056, "step": 101700 }, { "epoch": 2.004292098993916, "grad_norm": 2.089635133743286, "learning_rate": 9.56597009308334e-07, "loss": 2.0007, "num_input_tokens_seen": 103654728, "step": 101800 }, { "epoch": 2.006260951743419, "grad_norm": 1.8020271062850952, "learning_rate": 9.56512946773857e-07, "loss": 1.9986, "num_input_tokens_seen": 103756528, "step": 101900 }, { "epoch": 2.008229804492922, "grad_norm": 1.8193224668502808, "learning_rate": 9.564288066132037e-07, "loss": 1.9309, "num_input_tokens_seen": 103858584, "step": 102000 }, { "epoch": 2.010198657242425, "grad_norm": 2.004086971282959, "learning_rate": 9.563445888406815e-07, "loss": 1.9526, "num_input_tokens_seen": 103960984, "step": 102100 }, { "epoch": 2.0121675099919276, "grad_norm": 1.7387619018554688, "learning_rate": 9.562602934706112e-07, "loss": 2.0099, "num_input_tokens_seen": 104062488, "step": 102200 }, { "epoch": 2.0141363627414304, "grad_norm": 1.8126006126403809, "learning_rate": 9.561759205173264e-07, "loss": 2.0142, "num_input_tokens_seen": 104163696, "step": 102300 }, { "epoch": 2.0161052154909336, "grad_norm": 2.117628335952759, "learning_rate": 9.560914699951736e-07, "loss": 1.9843, "num_input_tokens_seen": 104266096, "step": 102400 }, { "epoch": 2.0180740682404363, "grad_norm": 1.9752012491226196, "learning_rate": 9.560069419185131e-07, "loss": 1.9188, "num_input_tokens_seen": 104368496, "step": 102500 }, { "epoch": 2.020042920989939, "grad_norm": 1.9322516918182373, "learning_rate": 9.559223363017184e-07, "loss": 1.9917, "num_input_tokens_seen": 104467432, "step": 102600 }, { "epoch": 2.022011773739442, "grad_norm": 1.8220409154891968, "learning_rate": 9.558376531591758e-07, "loss": 2.0283, "num_input_tokens_seen": 104569368, "step": 102700 }, { "epoch": 2.023980626488945, "grad_norm": 2.0056328773498535, "learning_rate": 9.55752892505285e-07, "loss": 2.0213, "num_input_tokens_seen": 104669888, "step": 102800 }, { "epoch": 2.025949479238448, "grad_norm": 1.9241981506347656, "learning_rate": 9.556680543544591e-07, "loss": 2.0665, "num_input_tokens_seen": 104771376, "step": 102900 }, { "epoch": 2.0279183319879506, "grad_norm": 1.925260305404663, "learning_rate": 9.555831387211234e-07, "loss": 1.9477, "num_input_tokens_seen": 104873776, "step": 103000 }, { "epoch": 2.0298871847374533, "grad_norm": 1.9352415800094604, "learning_rate": 9.55498145619718e-07, "loss": 2.0178, "num_input_tokens_seen": 104976176, "step": 103100 }, { "epoch": 2.0318560374869565, "grad_norm": 1.9706671237945557, "learning_rate": 9.554130750646946e-07, "loss": 1.9767, "num_input_tokens_seen": 105078032, "step": 103200 }, { "epoch": 2.0338248902364593, "grad_norm": 1.8645918369293213, "learning_rate": 9.553279270705186e-07, "loss": 1.9914, "num_input_tokens_seen": 105180432, "step": 103300 }, { "epoch": 2.035793742985962, "grad_norm": 1.78374183177948, "learning_rate": 9.552427016516692e-07, "loss": 1.9623, "num_input_tokens_seen": 105282832, "step": 103400 }, { "epoch": 2.037762595735465, "grad_norm": 2.105135202407837, "learning_rate": 9.551573988226382e-07, "loss": 1.9699, "num_input_tokens_seen": 105385232, "step": 103500 }, { "epoch": 2.039731448484968, "grad_norm": 2.1146798133850098, "learning_rate": 9.550720185979306e-07, "loss": 2.0081, "num_input_tokens_seen": 105487632, "step": 103600 }, { "epoch": 2.0417003012344708, "grad_norm": 1.8652223348617554, "learning_rate": 9.549865609920642e-07, "loss": 2.0589, "num_input_tokens_seen": 105588200, "step": 103700 }, { "epoch": 2.0436691539839735, "grad_norm": 1.7666460275650024, "learning_rate": 9.54901026019571e-07, "loss": 1.9854, "num_input_tokens_seen": 105689824, "step": 103800 }, { "epoch": 2.0456380067334763, "grad_norm": 2.112034559249878, "learning_rate": 9.548154136949948e-07, "loss": 1.9903, "num_input_tokens_seen": 105791384, "step": 103900 }, { "epoch": 2.047606859482979, "grad_norm": 2.0934624671936035, "learning_rate": 9.547297240328937e-07, "loss": 2.0259, "num_input_tokens_seen": 105893784, "step": 104000 }, { "epoch": 2.0495757122324822, "grad_norm": 2.0104362964630127, "learning_rate": 9.546439570478383e-07, "loss": 1.9472, "num_input_tokens_seen": 105996184, "step": 104100 }, { "epoch": 2.051544564981985, "grad_norm": 1.9536254405975342, "learning_rate": 9.545581127544129e-07, "loss": 2.0228, "num_input_tokens_seen": 106097432, "step": 104200 }, { "epoch": 2.0535134177314878, "grad_norm": 1.8959099054336548, "learning_rate": 9.54472191167214e-07, "loss": 1.9946, "num_input_tokens_seen": 106197552, "step": 104300 }, { "epoch": 2.0554822704809905, "grad_norm": 1.8936536312103271, "learning_rate": 9.543861923008526e-07, "loss": 1.9822, "num_input_tokens_seen": 106299952, "step": 104400 }, { "epoch": 2.0574511232304937, "grad_norm": 1.8303178548812866, "learning_rate": 9.543001161699514e-07, "loss": 1.9786, "num_input_tokens_seen": 106401848, "step": 104500 }, { "epoch": 2.0594199759799965, "grad_norm": 1.9874995946884155, "learning_rate": 9.54213962789147e-07, "loss": 2.0032, "num_input_tokens_seen": 106502624, "step": 104600 }, { "epoch": 2.0613888287294992, "grad_norm": 1.8439503908157349, "learning_rate": 9.541277321730896e-07, "loss": 2.0316, "num_input_tokens_seen": 106604280, "step": 104700 }, { "epoch": 2.063357681479002, "grad_norm": 1.7990156412124634, "learning_rate": 9.540414243364415e-07, "loss": 1.9838, "num_input_tokens_seen": 106705936, "step": 104800 }, { "epoch": 2.065326534228505, "grad_norm": 2.1370999813079834, "learning_rate": 9.539550392938787e-07, "loss": 1.9563, "num_input_tokens_seen": 106808336, "step": 104900 }, { "epoch": 2.067295386978008, "grad_norm": 1.9992778301239014, "learning_rate": 9.538685770600906e-07, "loss": 2.0396, "num_input_tokens_seen": 106910296, "step": 105000 }, { "epoch": 2.0692642397275107, "grad_norm": 1.9784587621688843, "learning_rate": 9.537820376497787e-07, "loss": 1.9999, "num_input_tokens_seen": 107012696, "step": 105100 }, { "epoch": 2.0712330924770135, "grad_norm": 1.8854153156280518, "learning_rate": 9.536954210776588e-07, "loss": 1.9577, "num_input_tokens_seen": 107115096, "step": 105200 }, { "epoch": 2.0732019452265167, "grad_norm": 1.8609813451766968, "learning_rate": 9.536087273584592e-07, "loss": 2.0351, "num_input_tokens_seen": 107215840, "step": 105300 }, { "epoch": 2.0751707979760194, "grad_norm": 1.9522591829299927, "learning_rate": 9.535219565069215e-07, "loss": 1.9819, "num_input_tokens_seen": 107317464, "step": 105400 }, { "epoch": 2.077139650725522, "grad_norm": 2.1145567893981934, "learning_rate": 9.534351085378001e-07, "loss": 2.0, "num_input_tokens_seen": 107419056, "step": 105500 }, { "epoch": 2.079108503475025, "grad_norm": 2.0271804332733154, "learning_rate": 9.533481834658631e-07, "loss": 1.953, "num_input_tokens_seen": 107521456, "step": 105600 }, { "epoch": 2.081077356224528, "grad_norm": 1.906770944595337, "learning_rate": 9.532611813058912e-07, "loss": 1.9844, "num_input_tokens_seen": 107623856, "step": 105700 }, { "epoch": 2.083046208974031, "grad_norm": 2.354865074157715, "learning_rate": 9.531741020726783e-07, "loss": 2.0101, "num_input_tokens_seen": 107726256, "step": 105800 }, { "epoch": 2.0850150617235337, "grad_norm": 2.017036199569702, "learning_rate": 9.530869457810316e-07, "loss": 2.0208, "num_input_tokens_seen": 107827480, "step": 105900 }, { "epoch": 2.0869839144730364, "grad_norm": 2.106940269470215, "learning_rate": 9.529997124457713e-07, "loss": 1.9645, "num_input_tokens_seen": 107929880, "step": 106000 }, { "epoch": 2.0889527672225396, "grad_norm": 1.7717925310134888, "learning_rate": 9.529124020817308e-07, "loss": 1.9071, "num_input_tokens_seen": 108032280, "step": 106100 }, { "epoch": 2.0909216199720424, "grad_norm": 1.8631401062011719, "learning_rate": 9.528250147037563e-07, "loss": 2.0584, "num_input_tokens_seen": 108134680, "step": 106200 }, { "epoch": 2.092890472721545, "grad_norm": 1.8711248636245728, "learning_rate": 9.527375503267073e-07, "loss": 1.9705, "num_input_tokens_seen": 108236464, "step": 106300 }, { "epoch": 2.094859325471048, "grad_norm": 1.8259410858154297, "learning_rate": 9.526500089654565e-07, "loss": 1.9693, "num_input_tokens_seen": 108338864, "step": 106400 }, { "epoch": 2.0968281782205507, "grad_norm": 2.248173713684082, "learning_rate": 9.525623906348895e-07, "loss": 1.9823, "num_input_tokens_seen": 108440848, "step": 106500 }, { "epoch": 2.098797030970054, "grad_norm": 1.7875350713729858, "learning_rate": 9.52474695349905e-07, "loss": 2.0181, "num_input_tokens_seen": 108542304, "step": 106600 }, { "epoch": 2.1007658837195566, "grad_norm": 2.09037709236145, "learning_rate": 9.52386923125415e-07, "loss": 2.0001, "num_input_tokens_seen": 108644704, "step": 106700 }, { "epoch": 2.1027347364690594, "grad_norm": 1.9783692359924316, "learning_rate": 9.522990739763442e-07, "loss": 1.9933, "num_input_tokens_seen": 108747104, "step": 106800 }, { "epoch": 2.104703589218562, "grad_norm": 2.1536645889282227, "learning_rate": 9.522111479176307e-07, "loss": 2.0328, "num_input_tokens_seen": 108849504, "step": 106900 }, { "epoch": 2.1066724419680654, "grad_norm": 1.737360954284668, "learning_rate": 9.521231449642257e-07, "loss": 1.9932, "num_input_tokens_seen": 108950008, "step": 107000 }, { "epoch": 2.108641294717568, "grad_norm": 1.7101801633834839, "learning_rate": 9.520350651310934e-07, "loss": 1.9547, "num_input_tokens_seen": 109052408, "step": 107100 }, { "epoch": 2.110610147467071, "grad_norm": 1.784883737564087, "learning_rate": 9.519469084332108e-07, "loss": 1.9893, "num_input_tokens_seen": 109154376, "step": 107200 }, { "epoch": 2.1125790002165736, "grad_norm": 2.0286502838134766, "learning_rate": 9.518586748855683e-07, "loss": 1.9651, "num_input_tokens_seen": 109256208, "step": 107300 }, { "epoch": 2.114547852966077, "grad_norm": 1.8560724258422852, "learning_rate": 9.517703645031693e-07, "loss": 1.965, "num_input_tokens_seen": 109358608, "step": 107400 }, { "epoch": 2.1165167057155796, "grad_norm": 1.9718207120895386, "learning_rate": 9.516819773010303e-07, "loss": 2.0604, "num_input_tokens_seen": 109459968, "step": 107500 }, { "epoch": 2.1184855584650824, "grad_norm": 1.9123685359954834, "learning_rate": 9.515935132941806e-07, "loss": 1.9922, "num_input_tokens_seen": 109561296, "step": 107600 }, { "epoch": 2.120454411214585, "grad_norm": 1.8966621160507202, "learning_rate": 9.515049724976629e-07, "loss": 1.9399, "num_input_tokens_seen": 109663696, "step": 107700 }, { "epoch": 2.1224232639640883, "grad_norm": 1.7923712730407715, "learning_rate": 9.514163549265329e-07, "loss": 1.971, "num_input_tokens_seen": 109765912, "step": 107800 }, { "epoch": 2.124392116713591, "grad_norm": 2.306278944015503, "learning_rate": 9.513276605958592e-07, "loss": 1.9789, "num_input_tokens_seen": 109868312, "step": 107900 }, { "epoch": 2.126360969463094, "grad_norm": 1.8584436178207397, "learning_rate": 9.512388895207233e-07, "loss": 1.9907, "num_input_tokens_seen": 109970712, "step": 108000 }, { "epoch": 2.1283298222125966, "grad_norm": 2.0306830406188965, "learning_rate": 9.511500417162203e-07, "loss": 1.9896, "num_input_tokens_seen": 110071440, "step": 108100 }, { "epoch": 2.1302986749621, "grad_norm": 1.6810091733932495, "learning_rate": 9.510611171974579e-07, "loss": 2.0375, "num_input_tokens_seen": 110171600, "step": 108200 }, { "epoch": 2.1322675277116026, "grad_norm": 4.35719633102417, "learning_rate": 9.509721159795567e-07, "loss": 1.9479, "num_input_tokens_seen": 110272832, "step": 108300 }, { "epoch": 2.1342363804611053, "grad_norm": 2.484416961669922, "learning_rate": 9.508830380776512e-07, "loss": 2.0175, "num_input_tokens_seen": 110375232, "step": 108400 }, { "epoch": 2.136205233210608, "grad_norm": 1.8578368425369263, "learning_rate": 9.507938835068877e-07, "loss": 1.9634, "num_input_tokens_seen": 110477632, "step": 108500 }, { "epoch": 2.1381740859601113, "grad_norm": 1.989546537399292, "learning_rate": 9.507046522824267e-07, "loss": 2.0112, "num_input_tokens_seen": 110579904, "step": 108600 }, { "epoch": 2.140142938709614, "grad_norm": 2.319326162338257, "learning_rate": 9.506153444194409e-07, "loss": 1.9663, "num_input_tokens_seen": 110682304, "step": 108700 }, { "epoch": 2.142111791459117, "grad_norm": 1.910566806793213, "learning_rate": 9.505259599331164e-07, "loss": 1.9984, "num_input_tokens_seen": 110784704, "step": 108800 }, { "epoch": 2.1440806442086195, "grad_norm": 2.196389675140381, "learning_rate": 9.504364988386524e-07, "loss": 2.0161, "num_input_tokens_seen": 110885160, "step": 108900 }, { "epoch": 2.1460494969581223, "grad_norm": 2.400207757949829, "learning_rate": 9.503469611512609e-07, "loss": 2.0052, "num_input_tokens_seen": 110986720, "step": 109000 }, { "epoch": 2.1480183497076255, "grad_norm": 1.8510814905166626, "learning_rate": 9.50257346886167e-07, "loss": 2.0, "num_input_tokens_seen": 111087864, "step": 109100 }, { "epoch": 2.1499872024571283, "grad_norm": 2.4665424823760986, "learning_rate": 9.501676560586091e-07, "loss": 1.9725, "num_input_tokens_seen": 111188912, "step": 109200 }, { "epoch": 2.151956055206631, "grad_norm": 1.8784955739974976, "learning_rate": 9.50077888683838e-07, "loss": 1.9766, "num_input_tokens_seen": 111290312, "step": 109300 }, { "epoch": 2.153924907956134, "grad_norm": 2.3580844402313232, "learning_rate": 9.49988044777118e-07, "loss": 1.9844, "num_input_tokens_seen": 111392712, "step": 109400 }, { "epoch": 2.155893760705637, "grad_norm": 2.142921209335327, "learning_rate": 9.498981243537265e-07, "loss": 2.008, "num_input_tokens_seen": 111492632, "step": 109500 }, { "epoch": 2.1578626134551397, "grad_norm": 2.0246009826660156, "learning_rate": 9.498081274289535e-07, "loss": 1.9994, "num_input_tokens_seen": 111594104, "step": 109600 }, { "epoch": 2.1598314662046425, "grad_norm": 2.277479887008667, "learning_rate": 9.497180540181024e-07, "loss": 1.9804, "num_input_tokens_seen": 111695128, "step": 109700 }, { "epoch": 2.1618003189541453, "grad_norm": 1.8636360168457031, "learning_rate": 9.496279041364893e-07, "loss": 1.954, "num_input_tokens_seen": 111796760, "step": 109800 }, { "epoch": 2.1637691717036485, "grad_norm": 1.7724113464355469, "learning_rate": 9.495376777994434e-07, "loss": 1.9674, "num_input_tokens_seen": 111899160, "step": 109900 }, { "epoch": 2.1657380244531512, "grad_norm": 2.1405045986175537, "learning_rate": 9.494473750223069e-07, "loss": 2.0086, "num_input_tokens_seen": 112000976, "step": 110000 }, { "epoch": 2.167706877202654, "grad_norm": 1.9437774419784546, "learning_rate": 9.493569958204352e-07, "loss": 1.9913, "num_input_tokens_seen": 112103376, "step": 110100 }, { "epoch": 2.1696757299521567, "grad_norm": 1.9000684022903442, "learning_rate": 9.492665402091964e-07, "loss": 1.9962, "num_input_tokens_seen": 112205776, "step": 110200 }, { "epoch": 2.17164458270166, "grad_norm": 2.1248345375061035, "learning_rate": 9.491760082039716e-07, "loss": 1.9823, "num_input_tokens_seen": 112307224, "step": 110300 }, { "epoch": 2.1736134354511627, "grad_norm": 2.0565896034240723, "learning_rate": 9.490853998201551e-07, "loss": 1.9862, "num_input_tokens_seen": 112409624, "step": 110400 }, { "epoch": 2.1755822882006655, "grad_norm": 2.0068960189819336, "learning_rate": 9.489947150731544e-07, "loss": 1.9418, "num_input_tokens_seen": 112512024, "step": 110500 }, { "epoch": 2.177551140950168, "grad_norm": 1.938106894493103, "learning_rate": 9.489039539783892e-07, "loss": 1.9478, "num_input_tokens_seen": 112613608, "step": 110600 }, { "epoch": 2.179519993699671, "grad_norm": 1.8697112798690796, "learning_rate": 9.488131165512928e-07, "loss": 1.9644, "num_input_tokens_seen": 112715192, "step": 110700 }, { "epoch": 2.181488846449174, "grad_norm": 2.2095909118652344, "learning_rate": 9.487222028073114e-07, "loss": 1.9624, "num_input_tokens_seen": 112817592, "step": 110800 }, { "epoch": 2.183457699198677, "grad_norm": 1.6604583263397217, "learning_rate": 9.486312127619043e-07, "loss": 1.9457, "num_input_tokens_seen": 112919992, "step": 110900 }, { "epoch": 2.1854265519481797, "grad_norm": 1.733359694480896, "learning_rate": 9.485401464305433e-07, "loss": 1.9655, "num_input_tokens_seen": 113022392, "step": 111000 }, { "epoch": 2.1873954046976825, "grad_norm": 1.8730971813201904, "learning_rate": 9.484490038287135e-07, "loss": 1.9608, "num_input_tokens_seen": 113124792, "step": 111100 }, { "epoch": 2.1893642574471857, "grad_norm": 2.102570056915283, "learning_rate": 9.483577849719131e-07, "loss": 1.9875, "num_input_tokens_seen": 113226256, "step": 111200 }, { "epoch": 2.1913331101966884, "grad_norm": 1.8639987707138062, "learning_rate": 9.482664898756529e-07, "loss": 2.0006, "num_input_tokens_seen": 113327992, "step": 111300 }, { "epoch": 2.193301962946191, "grad_norm": 1.9622057676315308, "learning_rate": 9.481751185554569e-07, "loss": 1.9812, "num_input_tokens_seen": 113429528, "step": 111400 }, { "epoch": 2.195270815695694, "grad_norm": 1.8260507583618164, "learning_rate": 9.480836710268623e-07, "loss": 2.0086, "num_input_tokens_seen": 113531232, "step": 111500 }, { "epoch": 2.197239668445197, "grad_norm": 1.7977197170257568, "learning_rate": 9.479921473054185e-07, "loss": 2.004, "num_input_tokens_seen": 113633632, "step": 111600 }, { "epoch": 2.1992085211947, "grad_norm": 1.9349136352539062, "learning_rate": 9.479005474066888e-07, "loss": 1.9878, "num_input_tokens_seen": 113736032, "step": 111700 }, { "epoch": 2.2011773739442027, "grad_norm": 1.8296550512313843, "learning_rate": 9.478088713462487e-07, "loss": 1.9464, "num_input_tokens_seen": 113838432, "step": 111800 }, { "epoch": 2.2031462266937054, "grad_norm": 2.0176846981048584, "learning_rate": 9.477171191396868e-07, "loss": 1.941, "num_input_tokens_seen": 113940256, "step": 111900 }, { "epoch": 2.2051150794432086, "grad_norm": 1.746730089187622, "learning_rate": 9.476252908026054e-07, "loss": 2.0294, "num_input_tokens_seen": 114042344, "step": 112000 }, { "epoch": 2.2070839321927114, "grad_norm": 2.1340911388397217, "learning_rate": 9.475333863506185e-07, "loss": 1.9986, "num_input_tokens_seen": 114144304, "step": 112100 }, { "epoch": 2.209052784942214, "grad_norm": 1.737505316734314, "learning_rate": 9.474414057993541e-07, "loss": 1.9797, "num_input_tokens_seen": 114246184, "step": 112200 }, { "epoch": 2.211021637691717, "grad_norm": 1.732858657836914, "learning_rate": 9.473493491644523e-07, "loss": 1.9783, "num_input_tokens_seen": 114347368, "step": 112300 }, { "epoch": 2.21299049044122, "grad_norm": 1.7610417604446411, "learning_rate": 9.472572164615668e-07, "loss": 1.9963, "num_input_tokens_seen": 114449064, "step": 112400 }, { "epoch": 2.214959343190723, "grad_norm": 1.9439780712127686, "learning_rate": 9.471650077063642e-07, "loss": 2.0007, "num_input_tokens_seen": 114550840, "step": 112500 }, { "epoch": 2.2169281959402256, "grad_norm": 2.0145537853240967, "learning_rate": 9.470727229145235e-07, "loss": 2.0029, "num_input_tokens_seen": 114653240, "step": 112600 }, { "epoch": 2.2188970486897284, "grad_norm": 2.5199801921844482, "learning_rate": 9.469803621017371e-07, "loss": 1.9654, "num_input_tokens_seen": 114755120, "step": 112700 }, { "epoch": 2.2208659014392316, "grad_norm": 1.873188853263855, "learning_rate": 9.468879252837101e-07, "loss": 1.9523, "num_input_tokens_seen": 114856672, "step": 112800 }, { "epoch": 2.2228347541887343, "grad_norm": 1.9388569593429565, "learning_rate": 9.467954124761606e-07, "loss": 2.0349, "num_input_tokens_seen": 114958008, "step": 112900 }, { "epoch": 2.224803606938237, "grad_norm": 2.1790852546691895, "learning_rate": 9.467028236948197e-07, "loss": 1.9762, "num_input_tokens_seen": 115060408, "step": 113000 }, { "epoch": 2.22677245968774, "grad_norm": 2.057921886444092, "learning_rate": 9.466101589554312e-07, "loss": 1.9801, "num_input_tokens_seen": 115161952, "step": 113100 }, { "epoch": 2.2287413124372426, "grad_norm": 1.8343178033828735, "learning_rate": 9.465174182737521e-07, "loss": 1.9496, "num_input_tokens_seen": 115264352, "step": 113200 }, { "epoch": 2.230710165186746, "grad_norm": 1.61873459815979, "learning_rate": 9.464246016655522e-07, "loss": 1.9687, "num_input_tokens_seen": 115366752, "step": 113300 }, { "epoch": 2.2326790179362486, "grad_norm": 1.95689058303833, "learning_rate": 9.463317091466141e-07, "loss": 2.0141, "num_input_tokens_seen": 115469152, "step": 113400 }, { "epoch": 2.2346478706857513, "grad_norm": 1.8117475509643555, "learning_rate": 9.462387407327332e-07, "loss": 1.9562, "num_input_tokens_seen": 115571552, "step": 113500 }, { "epoch": 2.236616723435254, "grad_norm": 1.9498783349990845, "learning_rate": 9.461456964397184e-07, "loss": 1.9446, "num_input_tokens_seen": 115673952, "step": 113600 }, { "epoch": 2.2385855761847573, "grad_norm": 1.789787769317627, "learning_rate": 9.460525762833908e-07, "loss": 2.0464, "num_input_tokens_seen": 115775504, "step": 113700 }, { "epoch": 2.24055442893426, "grad_norm": 3.6420681476593018, "learning_rate": 9.459593802795849e-07, "loss": 1.9711, "num_input_tokens_seen": 115877296, "step": 113800 }, { "epoch": 2.242523281683763, "grad_norm": 1.8349014520645142, "learning_rate": 9.458661084441478e-07, "loss": 1.9621, "num_input_tokens_seen": 115979200, "step": 113900 }, { "epoch": 2.2444921344332656, "grad_norm": 1.928267478942871, "learning_rate": 9.457727607929393e-07, "loss": 2.015, "num_input_tokens_seen": 116081096, "step": 114000 }, { "epoch": 2.2464609871827688, "grad_norm": 2.3008081912994385, "learning_rate": 9.456793373418328e-07, "loss": 1.9905, "num_input_tokens_seen": 116182640, "step": 114100 }, { "epoch": 2.2484298399322715, "grad_norm": 2.4185616970062256, "learning_rate": 9.455858381067141e-07, "loss": 1.9803, "num_input_tokens_seen": 116285040, "step": 114200 }, { "epoch": 2.2503986926817743, "grad_norm": 2.0490100383758545, "learning_rate": 9.454922631034818e-07, "loss": 2.0164, "num_input_tokens_seen": 116387440, "step": 114300 }, { "epoch": 2.252367545431277, "grad_norm": 1.9811676740646362, "learning_rate": 9.453986123480476e-07, "loss": 2.0168, "num_input_tokens_seen": 116489184, "step": 114400 }, { "epoch": 2.2543363981807802, "grad_norm": 2.4179511070251465, "learning_rate": 9.453048858563359e-07, "loss": 2.0379, "num_input_tokens_seen": 116591584, "step": 114500 }, { "epoch": 2.256305250930283, "grad_norm": 1.692130208015442, "learning_rate": 9.452110836442844e-07, "loss": 1.9564, "num_input_tokens_seen": 116693984, "step": 114600 }, { "epoch": 2.2582741036797858, "grad_norm": 1.9258095026016235, "learning_rate": 9.451172057278432e-07, "loss": 1.9786, "num_input_tokens_seen": 116795520, "step": 114700 }, { "epoch": 2.2602429564292885, "grad_norm": 2.063319683074951, "learning_rate": 9.450232521229755e-07, "loss": 1.9775, "num_input_tokens_seen": 116897920, "step": 114800 }, { "epoch": 2.2622118091787913, "grad_norm": 1.850327730178833, "learning_rate": 9.449292228456572e-07, "loss": 1.957, "num_input_tokens_seen": 116999192, "step": 114900 }, { "epoch": 2.2641806619282945, "grad_norm": 2.2068047523498535, "learning_rate": 9.448351179118773e-07, "loss": 1.9599, "num_input_tokens_seen": 117100776, "step": 115000 }, { "epoch": 2.2661495146777972, "grad_norm": 1.7473108768463135, "learning_rate": 9.447409373376375e-07, "loss": 2.0059, "num_input_tokens_seen": 117202496, "step": 115100 }, { "epoch": 2.2681183674273, "grad_norm": 1.8753230571746826, "learning_rate": 9.446466811389524e-07, "loss": 1.9853, "num_input_tokens_seen": 117304136, "step": 115200 }, { "epoch": 2.270087220176803, "grad_norm": 1.7122375965118408, "learning_rate": 9.445523493318496e-07, "loss": 1.9855, "num_input_tokens_seen": 117406088, "step": 115300 }, { "epoch": 2.272056072926306, "grad_norm": 1.9982937574386597, "learning_rate": 9.444579419323692e-07, "loss": 1.9693, "num_input_tokens_seen": 117508488, "step": 115400 }, { "epoch": 2.2740249256758087, "grad_norm": 1.8192331790924072, "learning_rate": 9.443634589565646e-07, "loss": 2.0152, "num_input_tokens_seen": 117610112, "step": 115500 }, { "epoch": 2.2759937784253115, "grad_norm": 1.9701948165893555, "learning_rate": 9.442689004205015e-07, "loss": 1.9559, "num_input_tokens_seen": 117712512, "step": 115600 }, { "epoch": 2.2779626311748142, "grad_norm": 1.8190864324569702, "learning_rate": 9.441742663402593e-07, "loss": 1.9861, "num_input_tokens_seen": 117813136, "step": 115700 }, { "epoch": 2.2799314839243174, "grad_norm": 2.1761672496795654, "learning_rate": 9.440795567319294e-07, "loss": 1.966, "num_input_tokens_seen": 117913968, "step": 115800 }, { "epoch": 2.28190033667382, "grad_norm": 1.8368778228759766, "learning_rate": 9.439847716116162e-07, "loss": 1.9665, "num_input_tokens_seen": 118013648, "step": 115900 }, { "epoch": 2.283869189423323, "grad_norm": 2.028080940246582, "learning_rate": 9.438899109954377e-07, "loss": 1.99, "num_input_tokens_seen": 118115552, "step": 116000 }, { "epoch": 2.2858380421728257, "grad_norm": 2.0657784938812256, "learning_rate": 9.437949748995235e-07, "loss": 1.9954, "num_input_tokens_seen": 118215696, "step": 116100 }, { "epoch": 2.287806894922329, "grad_norm": 2.0885541439056396, "learning_rate": 9.436999633400171e-07, "loss": 2.0151, "num_input_tokens_seen": 118316944, "step": 116200 }, { "epoch": 2.2897757476718317, "grad_norm": 1.8826969861984253, "learning_rate": 9.436048763330742e-07, "loss": 1.9737, "num_input_tokens_seen": 118418792, "step": 116300 }, { "epoch": 2.2917446004213344, "grad_norm": 1.8971164226531982, "learning_rate": 9.435097138948635e-07, "loss": 2.0081, "num_input_tokens_seen": 118520376, "step": 116400 }, { "epoch": 2.293713453170837, "grad_norm": 1.8199630975723267, "learning_rate": 9.434144760415669e-07, "loss": 1.9929, "num_input_tokens_seen": 118622776, "step": 116500 }, { "epoch": 2.2956823059203404, "grad_norm": 2.096458911895752, "learning_rate": 9.433191627893784e-07, "loss": 1.9575, "num_input_tokens_seen": 118723912, "step": 116600 }, { "epoch": 2.297651158669843, "grad_norm": 1.834140419960022, "learning_rate": 9.432237741545055e-07, "loss": 2.0431, "num_input_tokens_seen": 118825456, "step": 116700 }, { "epoch": 2.299620011419346, "grad_norm": 1.9655934572219849, "learning_rate": 9.431283101531681e-07, "loss": 1.9622, "num_input_tokens_seen": 118927856, "step": 116800 }, { "epoch": 2.3015888641688487, "grad_norm": 1.634841799736023, "learning_rate": 9.43032770801599e-07, "loss": 1.9598, "num_input_tokens_seen": 119030256, "step": 116900 }, { "epoch": 2.303557716918352, "grad_norm": 1.7568492889404297, "learning_rate": 9.429371561160442e-07, "loss": 2.0286, "num_input_tokens_seen": 119131464, "step": 117000 }, { "epoch": 2.3055265696678546, "grad_norm": 1.7758232355117798, "learning_rate": 9.428414661127618e-07, "loss": 1.9796, "num_input_tokens_seen": 119232936, "step": 117100 }, { "epoch": 2.3074954224173574, "grad_norm": 1.8784618377685547, "learning_rate": 9.427457008080229e-07, "loss": 1.9914, "num_input_tokens_seen": 119335040, "step": 117200 }, { "epoch": 2.30946427516686, "grad_norm": 1.991125464439392, "learning_rate": 9.42649860218112e-07, "loss": 2.0175, "num_input_tokens_seen": 119436864, "step": 117300 }, { "epoch": 2.311433127916363, "grad_norm": 1.7689714431762695, "learning_rate": 9.425539443593261e-07, "loss": 1.985, "num_input_tokens_seen": 119539264, "step": 117400 }, { "epoch": 2.313401980665866, "grad_norm": 1.9021596908569336, "learning_rate": 9.424579532479744e-07, "loss": 2.0146, "num_input_tokens_seen": 119641024, "step": 117500 }, { "epoch": 2.315370833415369, "grad_norm": 1.9273254871368408, "learning_rate": 9.423618869003798e-07, "loss": 2.0019, "num_input_tokens_seen": 119742472, "step": 117600 }, { "epoch": 2.3173396861648716, "grad_norm": 2.1978797912597656, "learning_rate": 9.422657453328772e-07, "loss": 1.9974, "num_input_tokens_seen": 119844872, "step": 117700 }, { "epoch": 2.319308538914375, "grad_norm": 1.8331992626190186, "learning_rate": 9.42169528561815e-07, "loss": 2.017, "num_input_tokens_seen": 119946560, "step": 117800 }, { "epoch": 2.3212773916638776, "grad_norm": 1.8678513765335083, "learning_rate": 9.420732366035538e-07, "loss": 1.9745, "num_input_tokens_seen": 120048392, "step": 117900 }, { "epoch": 2.3232462444133803, "grad_norm": 1.968079924583435, "learning_rate": 9.419768694744672e-07, "loss": 1.9955, "num_input_tokens_seen": 120149472, "step": 118000 }, { "epoch": 2.325215097162883, "grad_norm": 1.772836446762085, "learning_rate": 9.41880427190942e-07, "loss": 1.9925, "num_input_tokens_seen": 120251872, "step": 118100 }, { "epoch": 2.327183949912386, "grad_norm": 1.8192387819290161, "learning_rate": 9.41783909769377e-07, "loss": 2.022, "num_input_tokens_seen": 120354272, "step": 118200 }, { "epoch": 2.329152802661889, "grad_norm": 2.0357606410980225, "learning_rate": 9.416873172261843e-07, "loss": 2.015, "num_input_tokens_seen": 120456672, "step": 118300 }, { "epoch": 2.331121655411392, "grad_norm": 1.712955355644226, "learning_rate": 9.415906495777888e-07, "loss": 1.9741, "num_input_tokens_seen": 120558152, "step": 118400 }, { "epoch": 2.3330905081608946, "grad_norm": 1.790560007095337, "learning_rate": 9.414939068406278e-07, "loss": 1.9844, "num_input_tokens_seen": 120659776, "step": 118500 }, { "epoch": 2.3350593609103973, "grad_norm": 1.6297513246536255, "learning_rate": 9.413970890311517e-07, "loss": 1.9849, "num_input_tokens_seen": 120761304, "step": 118600 }, { "epoch": 2.3370282136599005, "grad_norm": 3.2493929862976074, "learning_rate": 9.413001961658235e-07, "loss": 2.046, "num_input_tokens_seen": 120863704, "step": 118700 }, { "epoch": 2.3389970664094033, "grad_norm": 1.8684008121490479, "learning_rate": 9.412032282611191e-07, "loss": 1.9968, "num_input_tokens_seen": 120965288, "step": 118800 }, { "epoch": 2.340965919158906, "grad_norm": 1.9268240928649902, "learning_rate": 9.411061853335269e-07, "loss": 2.0215, "num_input_tokens_seen": 121067240, "step": 118900 }, { "epoch": 2.342934771908409, "grad_norm": 2.0446953773498535, "learning_rate": 9.410090673995483e-07, "loss": 1.9718, "num_input_tokens_seen": 121167992, "step": 119000 }, { "epoch": 2.344903624657912, "grad_norm": 2.1195380687713623, "learning_rate": 9.409118744756977e-07, "loss": 1.9698, "num_input_tokens_seen": 121268824, "step": 119100 }, { "epoch": 2.346872477407415, "grad_norm": 2.0330278873443604, "learning_rate": 9.408146065785014e-07, "loss": 2.0111, "num_input_tokens_seen": 121370912, "step": 119200 }, { "epoch": 2.3488413301569175, "grad_norm": 2.2201755046844482, "learning_rate": 9.407172637244994e-07, "loss": 1.9716, "num_input_tokens_seen": 121472288, "step": 119300 }, { "epoch": 2.3508101829064203, "grad_norm": 1.8585072755813599, "learning_rate": 9.406198459302439e-07, "loss": 1.9807, "num_input_tokens_seen": 121574304, "step": 119400 }, { "epoch": 2.3527790356559235, "grad_norm": 1.8911980390548706, "learning_rate": 9.405223532122999e-07, "loss": 1.9585, "num_input_tokens_seen": 121676240, "step": 119500 }, { "epoch": 2.3547478884054263, "grad_norm": 2.043912172317505, "learning_rate": 9.404247855872452e-07, "loss": 1.9701, "num_input_tokens_seen": 121778016, "step": 119600 }, { "epoch": 2.356716741154929, "grad_norm": 1.8020490407943726, "learning_rate": 9.403271430716706e-07, "loss": 1.9727, "num_input_tokens_seen": 121880416, "step": 119700 }, { "epoch": 2.3586855939044318, "grad_norm": 2.0996904373168945, "learning_rate": 9.40229425682179e-07, "loss": 1.9843, "num_input_tokens_seen": 121982816, "step": 119800 }, { "epoch": 2.3606544466539345, "grad_norm": 1.9785363674163818, "learning_rate": 9.401316334353868e-07, "loss": 1.9667, "num_input_tokens_seen": 122084400, "step": 119900 }, { "epoch": 2.3626232994034377, "grad_norm": 1.6954538822174072, "learning_rate": 9.400337663479226e-07, "loss": 1.9978, "num_input_tokens_seen": 122186800, "step": 120000 }, { "epoch": 2.3645921521529405, "grad_norm": 2.209807872772217, "learning_rate": 9.399358244364277e-07, "loss": 2.0276, "num_input_tokens_seen": 122287616, "step": 120100 }, { "epoch": 2.3665610049024433, "grad_norm": 1.764782190322876, "learning_rate": 9.398378077175566e-07, "loss": 1.9761, "num_input_tokens_seen": 122388864, "step": 120200 }, { "epoch": 2.3685298576519465, "grad_norm": 1.9492393732070923, "learning_rate": 9.397397162079759e-07, "loss": 2.0153, "num_input_tokens_seen": 122491264, "step": 120300 }, { "epoch": 2.370498710401449, "grad_norm": 1.8193806409835815, "learning_rate": 9.396415499243655e-07, "loss": 1.9858, "num_input_tokens_seen": 122592256, "step": 120400 }, { "epoch": 2.372467563150952, "grad_norm": 1.8664618730545044, "learning_rate": 9.395433088834176e-07, "loss": 1.9605, "num_input_tokens_seen": 122693184, "step": 120500 }, { "epoch": 2.3744364159004547, "grad_norm": 1.7667503356933594, "learning_rate": 9.394449931018373e-07, "loss": 1.9666, "num_input_tokens_seen": 122795584, "step": 120600 }, { "epoch": 2.3764052686499575, "grad_norm": 1.727834939956665, "learning_rate": 9.393466025963423e-07, "loss": 1.9711, "num_input_tokens_seen": 122897248, "step": 120700 }, { "epoch": 2.3783741213994607, "grad_norm": 1.9298264980316162, "learning_rate": 9.392481373836633e-07, "loss": 1.959, "num_input_tokens_seen": 122999648, "step": 120800 }, { "epoch": 2.3803429741489635, "grad_norm": 2.3186757564544678, "learning_rate": 9.391495974805432e-07, "loss": 1.9725, "num_input_tokens_seen": 123101528, "step": 120900 }, { "epoch": 2.382311826898466, "grad_norm": 2.5058555603027344, "learning_rate": 9.390509829037381e-07, "loss": 1.9841, "num_input_tokens_seen": 123203056, "step": 121000 }, { "epoch": 2.384280679647969, "grad_norm": 6.203256607055664, "learning_rate": 9.389522936700165e-07, "loss": 1.9885, "num_input_tokens_seen": 123304688, "step": 121100 }, { "epoch": 2.386249532397472, "grad_norm": 1.8381099700927734, "learning_rate": 9.388535297961597e-07, "loss": 1.9858, "num_input_tokens_seen": 123407088, "step": 121200 }, { "epoch": 2.388218385146975, "grad_norm": 1.7783349752426147, "learning_rate": 9.387546912989614e-07, "loss": 2.0128, "num_input_tokens_seen": 123509488, "step": 121300 }, { "epoch": 2.3901872378964777, "grad_norm": 2.1013903617858887, "learning_rate": 9.386557781952287e-07, "loss": 1.9363, "num_input_tokens_seen": 123611888, "step": 121400 }, { "epoch": 2.3921560906459804, "grad_norm": 1.786528468132019, "learning_rate": 9.385567905017806e-07, "loss": 1.9847, "num_input_tokens_seen": 123713712, "step": 121500 }, { "epoch": 2.394124943395483, "grad_norm": 1.7002952098846436, "learning_rate": 9.384577282354493e-07, "loss": 2.011, "num_input_tokens_seen": 123814688, "step": 121600 }, { "epoch": 2.3960937961449864, "grad_norm": 1.940887451171875, "learning_rate": 9.383585914130794e-07, "loss": 1.946, "num_input_tokens_seen": 123916328, "step": 121700 }, { "epoch": 2.398062648894489, "grad_norm": 2.1516923904418945, "learning_rate": 9.382593800515283e-07, "loss": 1.9333, "num_input_tokens_seen": 124017312, "step": 121800 }, { "epoch": 2.400031501643992, "grad_norm": 1.819543480873108, "learning_rate": 9.381600941676661e-07, "loss": 2.0135, "num_input_tokens_seen": 124118480, "step": 121900 }, { "epoch": 2.402000354393495, "grad_norm": 1.9588466882705688, "learning_rate": 9.380607337783754e-07, "loss": 1.9691, "num_input_tokens_seen": 124220128, "step": 122000 }, { "epoch": 2.403969207142998, "grad_norm": 1.8389747142791748, "learning_rate": 9.379612989005517e-07, "loss": 2.0552, "num_input_tokens_seen": 124322528, "step": 122100 }, { "epoch": 2.4059380598925006, "grad_norm": 2.093642473220825, "learning_rate": 9.378617895511032e-07, "loss": 1.983, "num_input_tokens_seen": 124424560, "step": 122200 }, { "epoch": 2.4079069126420034, "grad_norm": 2.068624973297119, "learning_rate": 9.377622057469505e-07, "loss": 2.0576, "num_input_tokens_seen": 124525328, "step": 122300 }, { "epoch": 2.409875765391506, "grad_norm": 6.074371814727783, "learning_rate": 9.376625475050268e-07, "loss": 1.9936, "num_input_tokens_seen": 124627080, "step": 122400 }, { "epoch": 2.4118446181410094, "grad_norm": 1.7730644941329956, "learning_rate": 9.375628148422785e-07, "loss": 1.9775, "num_input_tokens_seen": 124729480, "step": 122500 }, { "epoch": 2.413813470890512, "grad_norm": 2.0941343307495117, "learning_rate": 9.37463007775664e-07, "loss": 1.9818, "num_input_tokens_seen": 124831880, "step": 122600 }, { "epoch": 2.415782323640015, "grad_norm": 2.197303056716919, "learning_rate": 9.373631263221547e-07, "loss": 1.9536, "num_input_tokens_seen": 124933616, "step": 122700 }, { "epoch": 2.4177511763895176, "grad_norm": 1.9617540836334229, "learning_rate": 9.372631704987347e-07, "loss": 2.0507, "num_input_tokens_seen": 125036016, "step": 122800 }, { "epoch": 2.419720029139021, "grad_norm": 1.848714828491211, "learning_rate": 9.371631403224008e-07, "loss": 1.9965, "num_input_tokens_seen": 125137824, "step": 122900 }, { "epoch": 2.4216888818885236, "grad_norm": 1.8612762689590454, "learning_rate": 9.370630358101618e-07, "loss": 1.9897, "num_input_tokens_seen": 125238720, "step": 123000 }, { "epoch": 2.4236577346380264, "grad_norm": 2.054504871368408, "learning_rate": 9.3696285697904e-07, "loss": 1.9864, "num_input_tokens_seen": 125340568, "step": 123100 }, { "epoch": 2.425626587387529, "grad_norm": 1.8542107343673706, "learning_rate": 9.368626038460699e-07, "loss": 1.9357, "num_input_tokens_seen": 125442968, "step": 123200 }, { "epoch": 2.4275954401370323, "grad_norm": 1.8898608684539795, "learning_rate": 9.367622764282987e-07, "loss": 1.9892, "num_input_tokens_seen": 125544640, "step": 123300 }, { "epoch": 2.429564292886535, "grad_norm": 1.8329038619995117, "learning_rate": 9.366618747427863e-07, "loss": 1.9773, "num_input_tokens_seen": 125647040, "step": 123400 }, { "epoch": 2.431533145636038, "grad_norm": 1.8550149202346802, "learning_rate": 9.36561398806605e-07, "loss": 2.033, "num_input_tokens_seen": 125748944, "step": 123500 }, { "epoch": 2.4335019983855406, "grad_norm": 1.8037301301956177, "learning_rate": 9.364608486368399e-07, "loss": 1.9951, "num_input_tokens_seen": 125851256, "step": 123600 }, { "epoch": 2.435470851135044, "grad_norm": 1.756620168685913, "learning_rate": 9.363602242505891e-07, "loss": 1.9914, "num_input_tokens_seen": 125953072, "step": 123700 }, { "epoch": 2.4374397038845466, "grad_norm": 2.3889143466949463, "learning_rate": 9.362595256649621e-07, "loss": 1.9774, "num_input_tokens_seen": 126054664, "step": 123800 }, { "epoch": 2.4394085566340493, "grad_norm": 1.8695127964019775, "learning_rate": 9.361587528970827e-07, "loss": 2.0321, "num_input_tokens_seen": 126155992, "step": 123900 }, { "epoch": 2.441377409383552, "grad_norm": 1.987545132637024, "learning_rate": 9.360579059640859e-07, "loss": 1.9582, "num_input_tokens_seen": 126257528, "step": 124000 }, { "epoch": 2.443346262133055, "grad_norm": 1.7185331583023071, "learning_rate": 9.359569848831203e-07, "loss": 1.9825, "num_input_tokens_seen": 126359416, "step": 124100 }, { "epoch": 2.445315114882558, "grad_norm": 2.0995638370513916, "learning_rate": 9.358559896713463e-07, "loss": 1.9601, "num_input_tokens_seen": 126461816, "step": 124200 }, { "epoch": 2.447283967632061, "grad_norm": 2.0495667457580566, "learning_rate": 9.357549203459374e-07, "loss": 1.966, "num_input_tokens_seen": 126562848, "step": 124300 }, { "epoch": 2.4492528203815636, "grad_norm": 1.9624818563461304, "learning_rate": 9.356537769240797e-07, "loss": 1.9607, "num_input_tokens_seen": 126664672, "step": 124400 }, { "epoch": 2.4512216731310668, "grad_norm": 1.7971683740615845, "learning_rate": 9.355525594229717e-07, "loss": 2.0049, "num_input_tokens_seen": 126765992, "step": 124500 }, { "epoch": 2.4531905258805695, "grad_norm": 1.8750773668289185, "learning_rate": 9.354512678598245e-07, "loss": 2.0275, "num_input_tokens_seen": 126868392, "step": 124600 }, { "epoch": 2.4551593786300723, "grad_norm": 1.8480435609817505, "learning_rate": 9.353499022518621e-07, "loss": 1.9962, "num_input_tokens_seen": 126969944, "step": 124700 }, { "epoch": 2.457128231379575, "grad_norm": 1.8100879192352295, "learning_rate": 9.352484626163206e-07, "loss": 1.9911, "num_input_tokens_seen": 127071792, "step": 124800 }, { "epoch": 2.459097084129078, "grad_norm": 1.913601279258728, "learning_rate": 9.351469489704491e-07, "loss": 1.9825, "num_input_tokens_seen": 127174192, "step": 124900 }, { "epoch": 2.461065936878581, "grad_norm": 1.8896150588989258, "learning_rate": 9.350453613315093e-07, "loss": 1.9921, "num_input_tokens_seen": 127273704, "step": 125000 }, { "epoch": 2.4630347896280838, "grad_norm": 2.3673813343048096, "learning_rate": 9.34943699716775e-07, "loss": 2.0182, "num_input_tokens_seen": 127375568, "step": 125100 }, { "epoch": 2.4650036423775865, "grad_norm": 2.0962560176849365, "learning_rate": 9.34841964143533e-07, "loss": 2.0025, "num_input_tokens_seen": 127477968, "step": 125200 }, { "epoch": 2.4669724951270893, "grad_norm": 1.9968198537826538, "learning_rate": 9.347401546290827e-07, "loss": 1.9551, "num_input_tokens_seen": 127580368, "step": 125300 }, { "epoch": 2.4689413478765925, "grad_norm": 1.636101245880127, "learning_rate": 9.346382711907359e-07, "loss": 1.9726, "num_input_tokens_seen": 127682768, "step": 125400 }, { "epoch": 2.4709102006260952, "grad_norm": 2.0196454524993896, "learning_rate": 9.34536313845817e-07, "loss": 1.9582, "num_input_tokens_seen": 127785168, "step": 125500 }, { "epoch": 2.472879053375598, "grad_norm": 2.038447380065918, "learning_rate": 9.344342826116629e-07, "loss": 1.9848, "num_input_tokens_seen": 127887568, "step": 125600 }, { "epoch": 2.4748479061251007, "grad_norm": 1.792228102684021, "learning_rate": 9.343321775056233e-07, "loss": 1.9615, "num_input_tokens_seen": 127989768, "step": 125700 }, { "epoch": 2.476816758874604, "grad_norm": 2.20328688621521, "learning_rate": 9.342299985450603e-07, "loss": 2.0324, "num_input_tokens_seen": 128090480, "step": 125800 }, { "epoch": 2.4787856116241067, "grad_norm": 1.8574694395065308, "learning_rate": 9.341277457473484e-07, "loss": 1.9777, "num_input_tokens_seen": 128192112, "step": 125900 }, { "epoch": 2.4807544643736095, "grad_norm": 1.7385035753250122, "learning_rate": 9.340254191298749e-07, "loss": 1.9614, "num_input_tokens_seen": 128294512, "step": 126000 }, { "epoch": 2.4827233171231122, "grad_norm": 1.932966709136963, "learning_rate": 9.339230187100398e-07, "loss": 1.9353, "num_input_tokens_seen": 128396912, "step": 126100 }, { "epoch": 2.4846921698726154, "grad_norm": 1.8148736953735352, "learning_rate": 9.338205445052551e-07, "loss": 1.963, "num_input_tokens_seen": 128498392, "step": 126200 }, { "epoch": 2.486661022622118, "grad_norm": 2.132577419281006, "learning_rate": 9.337179965329458e-07, "loss": 2.0064, "num_input_tokens_seen": 128600792, "step": 126300 }, { "epoch": 2.488629875371621, "grad_norm": 2.0273873805999756, "learning_rate": 9.336153748105494e-07, "loss": 1.9702, "num_input_tokens_seen": 128703192, "step": 126400 }, { "epoch": 2.4905987281211237, "grad_norm": 2.0853137969970703, "learning_rate": 9.335126793555157e-07, "loss": 1.9968, "num_input_tokens_seen": 128805592, "step": 126500 }, { "epoch": 2.4925675808706265, "grad_norm": 2.0068466663360596, "learning_rate": 9.334099101853075e-07, "loss": 2.0016, "num_input_tokens_seen": 128907112, "step": 126600 }, { "epoch": 2.4945364336201297, "grad_norm": 2.1706347465515137, "learning_rate": 9.333070673173994e-07, "loss": 1.9742, "num_input_tokens_seen": 129008776, "step": 126700 }, { "epoch": 2.4965052863696324, "grad_norm": 1.9625388383865356, "learning_rate": 9.332041507692793e-07, "loss": 1.9775, "num_input_tokens_seen": 129110656, "step": 126800 }, { "epoch": 2.498474139119135, "grad_norm": 1.8103257417678833, "learning_rate": 9.331011605584471e-07, "loss": 2.0023, "num_input_tokens_seen": 129212296, "step": 126900 }, { "epoch": 2.5004429918686384, "grad_norm": 1.702073097229004, "learning_rate": 9.329980967024156e-07, "loss": 1.9738, "num_input_tokens_seen": 129313952, "step": 127000 }, { "epoch": 2.502411844618141, "grad_norm": 1.8357709646224976, "learning_rate": 9.328949592187097e-07, "loss": 2.013, "num_input_tokens_seen": 129416352, "step": 127100 }, { "epoch": 2.504380697367644, "grad_norm": 2.4339730739593506, "learning_rate": 9.327917481248673e-07, "loss": 2.0074, "num_input_tokens_seen": 129517416, "step": 127200 }, { "epoch": 2.5063495501171467, "grad_norm": 1.863301396369934, "learning_rate": 9.326884634384384e-07, "loss": 1.935, "num_input_tokens_seen": 129619816, "step": 127300 }, { "epoch": 2.5083184028666494, "grad_norm": 2.1330418586730957, "learning_rate": 9.325851051769858e-07, "loss": 1.9947, "num_input_tokens_seen": 129721680, "step": 127400 }, { "epoch": 2.5102872556161526, "grad_norm": 1.7169445753097534, "learning_rate": 9.324816733580846e-07, "loss": 1.9705, "num_input_tokens_seen": 129823256, "step": 127500 }, { "epoch": 2.5122561083656554, "grad_norm": 2.6570725440979004, "learning_rate": 9.323781679993225e-07, "loss": 1.9716, "num_input_tokens_seen": 129924656, "step": 127600 }, { "epoch": 2.514224961115158, "grad_norm": 1.8668783903121948, "learning_rate": 9.322745891182999e-07, "loss": 2.0005, "num_input_tokens_seen": 130027056, "step": 127700 }, { "epoch": 2.516193813864661, "grad_norm": 1.8810783624649048, "learning_rate": 9.321709367326294e-07, "loss": 1.9997, "num_input_tokens_seen": 130129456, "step": 127800 }, { "epoch": 2.518162666614164, "grad_norm": 2.2021889686584473, "learning_rate": 9.320672108599361e-07, "loss": 2.0264, "num_input_tokens_seen": 130231016, "step": 127900 }, { "epoch": 2.520131519363667, "grad_norm": 2.0468647480010986, "learning_rate": 9.31963411517858e-07, "loss": 1.9705, "num_input_tokens_seen": 130333416, "step": 128000 }, { "epoch": 2.5221003721131696, "grad_norm": 1.8266481161117554, "learning_rate": 9.31859538724045e-07, "loss": 2.0478, "num_input_tokens_seen": 130434344, "step": 128100 }, { "epoch": 2.5240692248626724, "grad_norm": 1.7767115831375122, "learning_rate": 9.317555924961599e-07, "loss": 1.9972, "num_input_tokens_seen": 130536744, "step": 128200 }, { "epoch": 2.526038077612175, "grad_norm": 1.9214739799499512, "learning_rate": 9.316515728518779e-07, "loss": 2.027, "num_input_tokens_seen": 130639144, "step": 128300 }, { "epoch": 2.5280069303616783, "grad_norm": 1.867311954498291, "learning_rate": 9.315474798088867e-07, "loss": 1.9905, "num_input_tokens_seen": 130740896, "step": 128400 }, { "epoch": 2.529975783111181, "grad_norm": 2.0441529750823975, "learning_rate": 9.314433133848864e-07, "loss": 1.9569, "num_input_tokens_seen": 130842528, "step": 128500 }, { "epoch": 2.531944635860684, "grad_norm": 1.762548804283142, "learning_rate": 9.313390735975894e-07, "loss": 2.0254, "num_input_tokens_seen": 130944928, "step": 128600 }, { "epoch": 2.533913488610187, "grad_norm": 1.7779361009597778, "learning_rate": 9.312347604647213e-07, "loss": 1.989, "num_input_tokens_seen": 131046696, "step": 128700 }, { "epoch": 2.53588234135969, "grad_norm": 1.7223703861236572, "learning_rate": 9.311303740040193e-07, "loss": 1.9994, "num_input_tokens_seen": 131149096, "step": 128800 }, { "epoch": 2.5378511941091926, "grad_norm": 1.8027669191360474, "learning_rate": 9.310259142332335e-07, "loss": 2.004, "num_input_tokens_seen": 131250664, "step": 128900 }, { "epoch": 2.5398200468586953, "grad_norm": 1.9260095357894897, "learning_rate": 9.309213811701265e-07, "loss": 1.9928, "num_input_tokens_seen": 131353064, "step": 129000 }, { "epoch": 2.541788899608198, "grad_norm": 1.779481291770935, "learning_rate": 9.308167748324731e-07, "loss": 1.967, "num_input_tokens_seen": 131454872, "step": 129100 }, { "epoch": 2.5437577523577013, "grad_norm": 1.8147728443145752, "learning_rate": 9.307120952380607e-07, "loss": 1.9518, "num_input_tokens_seen": 131557272, "step": 129200 }, { "epoch": 2.545726605107204, "grad_norm": 1.7653882503509521, "learning_rate": 9.306073424046896e-07, "loss": 2.0163, "num_input_tokens_seen": 131659128, "step": 129300 }, { "epoch": 2.547695457856707, "grad_norm": 2.157727003097534, "learning_rate": 9.305025163501716e-07, "loss": 2.0305, "num_input_tokens_seen": 131760680, "step": 129400 }, { "epoch": 2.54966431060621, "grad_norm": 1.893654704093933, "learning_rate": 9.303976170923317e-07, "loss": 2.0263, "num_input_tokens_seen": 131863080, "step": 129500 }, { "epoch": 2.5516331633557128, "grad_norm": 1.816401720046997, "learning_rate": 9.302926446490073e-07, "loss": 1.9723, "num_input_tokens_seen": 131963232, "step": 129600 }, { "epoch": 2.5536020161052155, "grad_norm": 3.2825052738189697, "learning_rate": 9.301875990380479e-07, "loss": 2.0053, "num_input_tokens_seen": 132065096, "step": 129700 }, { "epoch": 2.5555708688547183, "grad_norm": 2.1008756160736084, "learning_rate": 9.300824802773156e-07, "loss": 2.0095, "num_input_tokens_seen": 132167496, "step": 129800 }, { "epoch": 2.557539721604221, "grad_norm": 1.709738850593567, "learning_rate": 9.299772883846851e-07, "loss": 2.0523, "num_input_tokens_seen": 132267848, "step": 129900 }, { "epoch": 2.559508574353724, "grad_norm": 1.909054160118103, "learning_rate": 9.298720233780432e-07, "loss": 1.9812, "num_input_tokens_seen": 132369320, "step": 130000 }, { "epoch": 2.561477427103227, "grad_norm": 1.8835586309432983, "learning_rate": 9.297666852752895e-07, "loss": 1.9506, "num_input_tokens_seen": 132471720, "step": 130100 }, { "epoch": 2.5634462798527298, "grad_norm": 1.8786972761154175, "learning_rate": 9.296612740943357e-07, "loss": 1.9678, "num_input_tokens_seen": 132573360, "step": 130200 }, { "epoch": 2.5654151326022325, "grad_norm": 2.090474843978882, "learning_rate": 9.295557898531064e-07, "loss": 1.9588, "num_input_tokens_seen": 132674944, "step": 130300 }, { "epoch": 2.5673839853517357, "grad_norm": 2.0154058933258057, "learning_rate": 9.294502325695378e-07, "loss": 1.9925, "num_input_tokens_seen": 132776768, "step": 130400 }, { "epoch": 2.5693528381012385, "grad_norm": 1.826133131980896, "learning_rate": 9.293446022615794e-07, "loss": 1.9585, "num_input_tokens_seen": 132878672, "step": 130500 }, { "epoch": 2.5713216908507412, "grad_norm": 1.9566258192062378, "learning_rate": 9.292388989471927e-07, "loss": 1.9453, "num_input_tokens_seen": 132980480, "step": 130600 }, { "epoch": 2.573290543600244, "grad_norm": 1.9297854900360107, "learning_rate": 9.291331226443516e-07, "loss": 1.9479, "num_input_tokens_seen": 133081544, "step": 130700 }, { "epoch": 2.5752593963497468, "grad_norm": 1.7936279773712158, "learning_rate": 9.290272733710425e-07, "loss": 1.9936, "num_input_tokens_seen": 133182768, "step": 130800 }, { "epoch": 2.57722824909925, "grad_norm": 1.8617814779281616, "learning_rate": 9.28921351145264e-07, "loss": 2.0311, "num_input_tokens_seen": 133284528, "step": 130900 }, { "epoch": 2.5791971018487527, "grad_norm": 1.7724997997283936, "learning_rate": 9.288153559850274e-07, "loss": 2.0212, "num_input_tokens_seen": 133386016, "step": 131000 }, { "epoch": 2.5811659545982555, "grad_norm": 1.9706727266311646, "learning_rate": 9.287092879083565e-07, "loss": 2.0261, "num_input_tokens_seen": 133487032, "step": 131100 }, { "epoch": 2.5831348073477587, "grad_norm": 6.432122707366943, "learning_rate": 9.286031469332871e-07, "loss": 1.9701, "num_input_tokens_seen": 133588608, "step": 131200 }, { "epoch": 2.5851036600972614, "grad_norm": 1.7948259115219116, "learning_rate": 9.284969330778674e-07, "loss": 1.9814, "num_input_tokens_seen": 133691008, "step": 131300 }, { "epoch": 2.587072512846764, "grad_norm": 1.7874032258987427, "learning_rate": 9.283906463601585e-07, "loss": 2.004, "num_input_tokens_seen": 133792328, "step": 131400 }, { "epoch": 2.589041365596267, "grad_norm": 1.7927825450897217, "learning_rate": 9.282842867982332e-07, "loss": 1.9606, "num_input_tokens_seen": 133894208, "step": 131500 }, { "epoch": 2.5910102183457697, "grad_norm": 2.154865026473999, "learning_rate": 9.281778544101775e-07, "loss": 1.9693, "num_input_tokens_seen": 133996608, "step": 131600 }, { "epoch": 2.592979071095273, "grad_norm": 1.800648808479309, "learning_rate": 9.28071349214089e-07, "loss": 1.9697, "num_input_tokens_seen": 134099008, "step": 131700 }, { "epoch": 2.5949479238447757, "grad_norm": 2.003859281539917, "learning_rate": 9.27964771228078e-07, "loss": 1.9429, "num_input_tokens_seen": 134201408, "step": 131800 }, { "epoch": 2.5969167765942784, "grad_norm": 1.988529920578003, "learning_rate": 9.278581204702672e-07, "loss": 1.9445, "num_input_tokens_seen": 134303808, "step": 131900 }, { "epoch": 2.5988856293437816, "grad_norm": 2.2726986408233643, "learning_rate": 9.277513969587919e-07, "loss": 1.9789, "num_input_tokens_seen": 134405456, "step": 132000 }, { "epoch": 2.6008544820932844, "grad_norm": 1.8805309534072876, "learning_rate": 9.276446007117993e-07, "loss": 1.985, "num_input_tokens_seen": 134507416, "step": 132100 }, { "epoch": 2.602823334842787, "grad_norm": 1.753617525100708, "learning_rate": 9.275377317474493e-07, "loss": 1.9638, "num_input_tokens_seen": 134609816, "step": 132200 }, { "epoch": 2.60479218759229, "grad_norm": 2.085961103439331, "learning_rate": 9.27430790083914e-07, "loss": 1.9709, "num_input_tokens_seen": 134711168, "step": 132300 }, { "epoch": 2.6067610403417927, "grad_norm": 1.8576356172561646, "learning_rate": 9.27323775739378e-07, "loss": 2.0193, "num_input_tokens_seen": 134813568, "step": 132400 }, { "epoch": 2.6087298930912954, "grad_norm": 1.9671186208724976, "learning_rate": 9.27216688732038e-07, "loss": 1.975, "num_input_tokens_seen": 134915104, "step": 132500 }, { "epoch": 2.6106987458407986, "grad_norm": 2.198086977005005, "learning_rate": 9.271095290801036e-07, "loss": 1.9641, "num_input_tokens_seen": 135017504, "step": 132600 }, { "epoch": 2.6126675985903014, "grad_norm": 1.8338600397109985, "learning_rate": 9.270022968017961e-07, "loss": 1.9719, "num_input_tokens_seen": 135117920, "step": 132700 }, { "epoch": 2.614636451339804, "grad_norm": 4.461668968200684, "learning_rate": 9.268949919153496e-07, "loss": 1.9957, "num_input_tokens_seen": 135219680, "step": 132800 }, { "epoch": 2.6166053040893074, "grad_norm": 2.001342535018921, "learning_rate": 9.267876144390102e-07, "loss": 1.9878, "num_input_tokens_seen": 135321280, "step": 132900 }, { "epoch": 2.61857415683881, "grad_norm": 1.8028596639633179, "learning_rate": 9.266801643910368e-07, "loss": 1.9808, "num_input_tokens_seen": 135422760, "step": 133000 }, { "epoch": 2.620543009588313, "grad_norm": 1.9720797538757324, "learning_rate": 9.265726417897002e-07, "loss": 2.0061, "num_input_tokens_seen": 135523760, "step": 133100 }, { "epoch": 2.6225118623378156, "grad_norm": 1.8140394687652588, "learning_rate": 9.264650466532836e-07, "loss": 1.9839, "num_input_tokens_seen": 135626160, "step": 133200 }, { "epoch": 2.6244807150873184, "grad_norm": 2.1675796508789062, "learning_rate": 9.263573790000829e-07, "loss": 1.974, "num_input_tokens_seen": 135728560, "step": 133300 }, { "epoch": 2.6264495678368216, "grad_norm": 2.0989134311676025, "learning_rate": 9.26249638848406e-07, "loss": 2.0111, "num_input_tokens_seen": 135830960, "step": 133400 }, { "epoch": 2.6284184205863244, "grad_norm": 2.260593891143799, "learning_rate": 9.261418262165731e-07, "loss": 2.0108, "num_input_tokens_seen": 135933360, "step": 133500 }, { "epoch": 2.630387273335827, "grad_norm": 2.1290645599365234, "learning_rate": 9.26033941122917e-07, "loss": 2.0208, "num_input_tokens_seen": 136035672, "step": 133600 }, { "epoch": 2.6323561260853303, "grad_norm": 2.066185235977173, "learning_rate": 9.259259835857825e-07, "loss": 2.0209, "num_input_tokens_seen": 136137256, "step": 133700 }, { "epoch": 2.634324978834833, "grad_norm": 1.9091438055038452, "learning_rate": 9.258179536235268e-07, "loss": 1.9751, "num_input_tokens_seen": 136239656, "step": 133800 }, { "epoch": 2.636293831584336, "grad_norm": 1.862436056137085, "learning_rate": 9.257098512545196e-07, "loss": 1.9894, "num_input_tokens_seen": 136342056, "step": 133900 }, { "epoch": 2.6382626843338386, "grad_norm": 1.8303775787353516, "learning_rate": 9.256016764971429e-07, "loss": 1.9832, "num_input_tokens_seen": 136442880, "step": 134000 }, { "epoch": 2.6402315370833414, "grad_norm": 2.034001350402832, "learning_rate": 9.254934293697909e-07, "loss": 1.9743, "num_input_tokens_seen": 136544760, "step": 134100 }, { "epoch": 2.6422003898328446, "grad_norm": 1.912581205368042, "learning_rate": 9.253851098908698e-07, "loss": 2.0062, "num_input_tokens_seen": 136647160, "step": 134200 }, { "epoch": 2.6441692425823473, "grad_norm": 2.018368721008301, "learning_rate": 9.252767180787988e-07, "loss": 2.0146, "num_input_tokens_seen": 136749560, "step": 134300 }, { "epoch": 2.64613809533185, "grad_norm": 1.8418313264846802, "learning_rate": 9.251682539520087e-07, "loss": 1.997, "num_input_tokens_seen": 136850640, "step": 134400 }, { "epoch": 2.648106948081353, "grad_norm": 1.9146231412887573, "learning_rate": 9.250597175289432e-07, "loss": 1.9582, "num_input_tokens_seen": 136953040, "step": 134500 }, { "epoch": 2.650075800830856, "grad_norm": 1.7362430095672607, "learning_rate": 9.249511088280577e-07, "loss": 2.0079, "num_input_tokens_seen": 137054728, "step": 134600 }, { "epoch": 2.652044653580359, "grad_norm": 1.7822071313858032, "learning_rate": 9.248424278678204e-07, "loss": 2.0114, "num_input_tokens_seen": 137157128, "step": 134700 }, { "epoch": 2.6540135063298615, "grad_norm": 1.8319240808486938, "learning_rate": 9.247336746667116e-07, "loss": 1.9643, "num_input_tokens_seen": 137258224, "step": 134800 }, { "epoch": 2.6559823590793643, "grad_norm": 2.119603395462036, "learning_rate": 9.246248492432239e-07, "loss": 1.9493, "num_input_tokens_seen": 137359768, "step": 134900 }, { "epoch": 2.657951211828867, "grad_norm": 1.8390417098999023, "learning_rate": 9.245159516158617e-07, "loss": 1.9964, "num_input_tokens_seen": 137461728, "step": 135000 }, { "epoch": 2.6599200645783703, "grad_norm": 1.992293119430542, "learning_rate": 9.244069818031426e-07, "loss": 1.9381, "num_input_tokens_seen": 137564128, "step": 135100 }, { "epoch": 2.661888917327873, "grad_norm": 1.7668195962905884, "learning_rate": 9.242979398235959e-07, "loss": 1.9997, "num_input_tokens_seen": 137665824, "step": 135200 }, { "epoch": 2.663857770077376, "grad_norm": 15.580219268798828, "learning_rate": 9.241888256957633e-07, "loss": 2.0132, "num_input_tokens_seen": 137768224, "step": 135300 }, { "epoch": 2.665826622826879, "grad_norm": 1.9911000728607178, "learning_rate": 9.240796394381985e-07, "loss": 1.9664, "num_input_tokens_seen": 137869808, "step": 135400 }, { "epoch": 2.6677954755763817, "grad_norm": 1.9079303741455078, "learning_rate": 9.23970381069468e-07, "loss": 1.9713, "num_input_tokens_seen": 137972208, "step": 135500 }, { "epoch": 2.6697643283258845, "grad_norm": 2.3192949295043945, "learning_rate": 9.238610506081501e-07, "loss": 1.9664, "num_input_tokens_seen": 138073880, "step": 135600 }, { "epoch": 2.6717331810753873, "grad_norm": 1.758529782295227, "learning_rate": 9.237516480728356e-07, "loss": 2.032, "num_input_tokens_seen": 138176280, "step": 135700 }, { "epoch": 2.67370203382489, "grad_norm": 2.1973562240600586, "learning_rate": 9.236421734821273e-07, "loss": 2.0422, "num_input_tokens_seen": 138276816, "step": 135800 }, { "epoch": 2.6756708865743932, "grad_norm": 2.0563364028930664, "learning_rate": 9.235326268546406e-07, "loss": 1.9659, "num_input_tokens_seen": 138378616, "step": 135900 }, { "epoch": 2.677639739323896, "grad_norm": 2.002542018890381, "learning_rate": 9.23423008209003e-07, "loss": 1.963, "num_input_tokens_seen": 138481016, "step": 136000 }, { "epoch": 2.6796085920733987, "grad_norm": 2.061286211013794, "learning_rate": 9.233133175638541e-07, "loss": 2.0181, "num_input_tokens_seen": 138582584, "step": 136100 }, { "epoch": 2.681577444822902, "grad_norm": 1.6337310075759888, "learning_rate": 9.23203554937846e-07, "loss": 1.9488, "num_input_tokens_seen": 138684168, "step": 136200 }, { "epoch": 2.6835462975724047, "grad_norm": 1.8827173709869385, "learning_rate": 9.230937203496428e-07, "loss": 1.9828, "num_input_tokens_seen": 138786568, "step": 136300 }, { "epoch": 2.6855151503219075, "grad_norm": 2.247293472290039, "learning_rate": 9.22983813817921e-07, "loss": 1.9879, "num_input_tokens_seen": 138888968, "step": 136400 }, { "epoch": 2.68748400307141, "grad_norm": 2.4556334018707275, "learning_rate": 9.228738353613693e-07, "loss": 1.9555, "num_input_tokens_seen": 138990600, "step": 136500 }, { "epoch": 2.689452855820913, "grad_norm": 3.499016523361206, "learning_rate": 9.227637849986886e-07, "loss": 1.9704, "num_input_tokens_seen": 139093000, "step": 136600 }, { "epoch": 2.691421708570416, "grad_norm": 2.174551248550415, "learning_rate": 9.22653662748592e-07, "loss": 1.9854, "num_input_tokens_seen": 139193960, "step": 136700 }, { "epoch": 2.693390561319919, "grad_norm": 1.9812849760055542, "learning_rate": 9.225434686298048e-07, "loss": 2.0031, "num_input_tokens_seen": 139293784, "step": 136800 }, { "epoch": 2.6953594140694217, "grad_norm": 1.7438524961471558, "learning_rate": 9.224332026610647e-07, "loss": 1.9305, "num_input_tokens_seen": 139396184, "step": 136900 }, { "epoch": 2.6973282668189245, "grad_norm": 1.6979296207427979, "learning_rate": 9.223228648611214e-07, "loss": 1.9994, "num_input_tokens_seen": 139498104, "step": 137000 }, { "epoch": 2.6992971195684277, "grad_norm": 1.828908085823059, "learning_rate": 9.222124552487372e-07, "loss": 1.9572, "num_input_tokens_seen": 139600504, "step": 137100 }, { "epoch": 2.7012659723179304, "grad_norm": 1.8999994993209839, "learning_rate": 9.221019738426859e-07, "loss": 1.9585, "num_input_tokens_seen": 139702904, "step": 137200 }, { "epoch": 2.703234825067433, "grad_norm": 2.0553765296936035, "learning_rate": 9.21991420661754e-07, "loss": 1.9695, "num_input_tokens_seen": 139803896, "step": 137300 }, { "epoch": 2.705203677816936, "grad_norm": 1.7925704717636108, "learning_rate": 9.218807957247406e-07, "loss": 1.9644, "num_input_tokens_seen": 139904408, "step": 137400 }, { "epoch": 2.7071725305664387, "grad_norm": 1.7893779277801514, "learning_rate": 9.217700990504559e-07, "loss": 2.0034, "num_input_tokens_seen": 140006168, "step": 137500 }, { "epoch": 2.709141383315942, "grad_norm": 1.9949560165405273, "learning_rate": 9.216593306577234e-07, "loss": 1.9777, "num_input_tokens_seen": 140108568, "step": 137600 }, { "epoch": 2.7111102360654447, "grad_norm": 2.1235673427581787, "learning_rate": 9.21548490565378e-07, "loss": 2.0064, "num_input_tokens_seen": 140210664, "step": 137700 }, { "epoch": 2.7130790888149474, "grad_norm": 2.0905706882476807, "learning_rate": 9.214375787922673e-07, "loss": 1.9855, "num_input_tokens_seen": 140312432, "step": 137800 }, { "epoch": 2.7150479415644506, "grad_norm": 1.8522093296051025, "learning_rate": 9.21326595357251e-07, "loss": 1.9697, "num_input_tokens_seen": 140413056, "step": 137900 }, { "epoch": 2.7170167943139534, "grad_norm": 1.9855432510375977, "learning_rate": 9.212155402792008e-07, "loss": 2.0105, "num_input_tokens_seen": 140513744, "step": 138000 }, { "epoch": 2.718985647063456, "grad_norm": 1.9174432754516602, "learning_rate": 9.211044135770005e-07, "loss": 2.0049, "num_input_tokens_seen": 140616040, "step": 138100 }, { "epoch": 2.720954499812959, "grad_norm": 1.7407281398773193, "learning_rate": 9.209932152695466e-07, "loss": 2.0164, "num_input_tokens_seen": 140717936, "step": 138200 }, { "epoch": 2.7229233525624617, "grad_norm": 1.7572743892669678, "learning_rate": 9.208819453757473e-07, "loss": 2.0132, "num_input_tokens_seen": 140820144, "step": 138300 }, { "epoch": 2.724892205311965, "grad_norm": 1.9282379150390625, "learning_rate": 9.207706039145229e-07, "loss": 1.9429, "num_input_tokens_seen": 140921808, "step": 138400 }, { "epoch": 2.7268610580614676, "grad_norm": 1.9206793308258057, "learning_rate": 9.206591909048063e-07, "loss": 2.027, "num_input_tokens_seen": 141023760, "step": 138500 }, { "epoch": 2.7288299108109704, "grad_norm": 2.001131296157837, "learning_rate": 9.205477063655424e-07, "loss": 2.0271, "num_input_tokens_seen": 141123640, "step": 138600 }, { "epoch": 2.7307987635604736, "grad_norm": 1.8215125799179077, "learning_rate": 9.204361503156881e-07, "loss": 2.0414, "num_input_tokens_seen": 141225992, "step": 138700 }, { "epoch": 2.7327676163099763, "grad_norm": 2.420888662338257, "learning_rate": 9.203245227742125e-07, "loss": 1.9799, "num_input_tokens_seen": 141328392, "step": 138800 }, { "epoch": 2.734736469059479, "grad_norm": 1.938777208328247, "learning_rate": 9.202128237600969e-07, "loss": 1.9957, "num_input_tokens_seen": 141429096, "step": 138900 }, { "epoch": 2.736705321808982, "grad_norm": 4.348512649536133, "learning_rate": 9.201010532923349e-07, "loss": 2.0387, "num_input_tokens_seen": 141530920, "step": 139000 }, { "epoch": 2.7386741745584846, "grad_norm": 1.7346715927124023, "learning_rate": 9.199892113899322e-07, "loss": 1.9955, "num_input_tokens_seen": 141631704, "step": 139100 }, { "epoch": 2.7406430273079874, "grad_norm": 1.8914837837219238, "learning_rate": 9.198772980719063e-07, "loss": 1.9765, "num_input_tokens_seen": 141733880, "step": 139200 }, { "epoch": 2.7426118800574906, "grad_norm": 1.7075005769729614, "learning_rate": 9.197653133572873e-07, "loss": 1.9777, "num_input_tokens_seen": 141834960, "step": 139300 }, { "epoch": 2.7445807328069933, "grad_norm": 2.315007448196411, "learning_rate": 9.196532572651172e-07, "loss": 2.0166, "num_input_tokens_seen": 141936176, "step": 139400 }, { "epoch": 2.746549585556496, "grad_norm": 1.8903642892837524, "learning_rate": 9.195411298144504e-07, "loss": 1.9442, "num_input_tokens_seen": 142038576, "step": 139500 }, { "epoch": 2.7485184383059993, "grad_norm": 1.965807557106018, "learning_rate": 9.194289310243528e-07, "loss": 1.9608, "num_input_tokens_seen": 142139456, "step": 139600 }, { "epoch": 2.750487291055502, "grad_norm": 1.9385151863098145, "learning_rate": 9.193166609139033e-07, "loss": 1.9783, "num_input_tokens_seen": 142241856, "step": 139700 }, { "epoch": 2.752456143805005, "grad_norm": 2.076444387435913, "learning_rate": 9.192043195021922e-07, "loss": 2.0202, "num_input_tokens_seen": 142344256, "step": 139800 }, { "epoch": 2.7544249965545076, "grad_norm": 4.172542095184326, "learning_rate": 9.190919068083224e-07, "loss": 2.0064, "num_input_tokens_seen": 142446032, "step": 139900 }, { "epoch": 2.7563938493040103, "grad_norm": 1.884360671043396, "learning_rate": 9.189794228514085e-07, "loss": 2.0104, "num_input_tokens_seen": 142548432, "step": 140000 }, { "epoch": 2.7583627020535135, "grad_norm": 1.7790875434875488, "learning_rate": 9.188668676505777e-07, "loss": 1.969, "num_input_tokens_seen": 142650280, "step": 140100 }, { "epoch": 2.7603315548030163, "grad_norm": 1.8419520854949951, "learning_rate": 9.187542412249689e-07, "loss": 1.9908, "num_input_tokens_seen": 142751704, "step": 140200 }, { "epoch": 2.762300407552519, "grad_norm": 1.9317359924316406, "learning_rate": 9.186415435937333e-07, "loss": 1.9929, "num_input_tokens_seen": 142854104, "step": 140300 }, { "epoch": 2.7642692603020222, "grad_norm": 2.1541104316711426, "learning_rate": 9.185287747760342e-07, "loss": 1.9594, "num_input_tokens_seen": 142955344, "step": 140400 }, { "epoch": 2.766238113051525, "grad_norm": 1.8215970993041992, "learning_rate": 9.18415934791047e-07, "loss": 2.0056, "num_input_tokens_seen": 143057744, "step": 140500 }, { "epoch": 2.7682069658010278, "grad_norm": 2.3591055870056152, "learning_rate": 9.183030236579591e-07, "loss": 1.9826, "num_input_tokens_seen": 143160144, "step": 140600 }, { "epoch": 2.7701758185505305, "grad_norm": 1.8065130710601807, "learning_rate": 9.181900413959703e-07, "loss": 2.0148, "num_input_tokens_seen": 143260632, "step": 140700 }, { "epoch": 2.7721446713000333, "grad_norm": 1.9266314506530762, "learning_rate": 9.18076988024292e-07, "loss": 1.9509, "num_input_tokens_seen": 143362720, "step": 140800 }, { "epoch": 2.7741135240495365, "grad_norm": 2.158468008041382, "learning_rate": 9.179638635621481e-07, "loss": 1.971, "num_input_tokens_seen": 143463128, "step": 140900 }, { "epoch": 2.7760823767990392, "grad_norm": 2.2344703674316406, "learning_rate": 9.178506680287746e-07, "loss": 1.9975, "num_input_tokens_seen": 143564736, "step": 141000 }, { "epoch": 2.778051229548542, "grad_norm": 1.9476065635681152, "learning_rate": 9.177374014434192e-07, "loss": 1.9718, "num_input_tokens_seen": 143664912, "step": 141100 }, { "epoch": 2.780020082298045, "grad_norm": 1.8100794553756714, "learning_rate": 9.176240638253421e-07, "loss": 1.9747, "num_input_tokens_seen": 143767312, "step": 141200 }, { "epoch": 2.781988935047548, "grad_norm": 1.922595739364624, "learning_rate": 9.175106551938153e-07, "loss": 1.9852, "num_input_tokens_seen": 143868904, "step": 141300 }, { "epoch": 2.7839577877970507, "grad_norm": 2.0649492740631104, "learning_rate": 9.173971755681231e-07, "loss": 1.9941, "num_input_tokens_seen": 143971304, "step": 141400 }, { "epoch": 2.7859266405465535, "grad_norm": 2.0110702514648438, "learning_rate": 9.172836249675617e-07, "loss": 1.9563, "num_input_tokens_seen": 144073704, "step": 141500 }, { "epoch": 2.7878954932960562, "grad_norm": 1.9530744552612305, "learning_rate": 9.171700034114394e-07, "loss": 2.0128, "num_input_tokens_seen": 144175584, "step": 141600 }, { "epoch": 2.789864346045559, "grad_norm": 2.0976758003234863, "learning_rate": 9.170563109190766e-07, "loss": 2.004, "num_input_tokens_seen": 144277984, "step": 141700 }, { "epoch": 2.791833198795062, "grad_norm": 2.0704092979431152, "learning_rate": 9.169425475098056e-07, "loss": 1.9728, "num_input_tokens_seen": 144379536, "step": 141800 }, { "epoch": 2.793802051544565, "grad_norm": 1.815583348274231, "learning_rate": 9.16828713202971e-07, "loss": 1.9135, "num_input_tokens_seen": 144481936, "step": 141900 }, { "epoch": 2.7957709042940677, "grad_norm": 1.7426722049713135, "learning_rate": 9.167148080179297e-07, "loss": 1.9656, "num_input_tokens_seen": 144584336, "step": 142000 }, { "epoch": 2.797739757043571, "grad_norm": 2.062602996826172, "learning_rate": 9.166008319740499e-07, "loss": 1.993, "num_input_tokens_seen": 144685840, "step": 142100 }, { "epoch": 2.7997086097930737, "grad_norm": 2.510030746459961, "learning_rate": 9.164867850907124e-07, "loss": 1.9507, "num_input_tokens_seen": 144788240, "step": 142200 }, { "epoch": 2.8016774625425764, "grad_norm": 1.8817150592803955, "learning_rate": 9.1637266738731e-07, "loss": 1.9942, "num_input_tokens_seen": 144889976, "step": 142300 }, { "epoch": 2.803646315292079, "grad_norm": 1.7231566905975342, "learning_rate": 9.162584788832473e-07, "loss": 2.0297, "num_input_tokens_seen": 144992376, "step": 142400 }, { "epoch": 2.805615168041582, "grad_norm": 1.8418123722076416, "learning_rate": 9.16144219597941e-07, "loss": 2.0164, "num_input_tokens_seen": 145094776, "step": 142500 }, { "epoch": 2.807584020791085, "grad_norm": 1.9155409336090088, "learning_rate": 9.160298895508202e-07, "loss": 1.948, "num_input_tokens_seen": 145195616, "step": 142600 }, { "epoch": 2.809552873540588, "grad_norm": 1.774415373802185, "learning_rate": 9.159154887613257e-07, "loss": 1.9702, "num_input_tokens_seen": 145298016, "step": 142700 }, { "epoch": 2.8115217262900907, "grad_norm": 1.7046304941177368, "learning_rate": 9.158010172489102e-07, "loss": 1.9656, "num_input_tokens_seen": 145400416, "step": 142800 }, { "epoch": 2.813490579039594, "grad_norm": 2.1160473823547363, "learning_rate": 9.156864750330389e-07, "loss": 1.9631, "num_input_tokens_seen": 145502816, "step": 142900 }, { "epoch": 2.8154594317890966, "grad_norm": 1.7345004081726074, "learning_rate": 9.155718621331885e-07, "loss": 1.9819, "num_input_tokens_seen": 145605216, "step": 143000 }, { "epoch": 2.8174282845385994, "grad_norm": 2.054319143295288, "learning_rate": 9.15457178568848e-07, "loss": 2.0453, "num_input_tokens_seen": 145707616, "step": 143100 }, { "epoch": 2.819397137288102, "grad_norm": 1.8499298095703125, "learning_rate": 9.153424243595185e-07, "loss": 1.9449, "num_input_tokens_seen": 145810016, "step": 143200 }, { "epoch": 2.821365990037605, "grad_norm": 1.8021115064620972, "learning_rate": 9.152275995247128e-07, "loss": 1.9548, "num_input_tokens_seen": 145912416, "step": 143300 }, { "epoch": 2.823334842787108, "grad_norm": 1.7581770420074463, "learning_rate": 9.15112704083956e-07, "loss": 1.9784, "num_input_tokens_seen": 146013976, "step": 143400 }, { "epoch": 2.825303695536611, "grad_norm": 2.0402488708496094, "learning_rate": 9.149977380567852e-07, "loss": 2.0034, "num_input_tokens_seen": 146115888, "step": 143500 }, { "epoch": 2.8272725482861136, "grad_norm": 1.9908995628356934, "learning_rate": 9.148827014627492e-07, "loss": 1.9587, "num_input_tokens_seen": 146218288, "step": 143600 }, { "epoch": 2.8292414010356164, "grad_norm": 1.9173650741577148, "learning_rate": 9.147675943214091e-07, "loss": 1.9649, "num_input_tokens_seen": 146320688, "step": 143700 }, { "epoch": 2.8312102537851196, "grad_norm": 1.8674744367599487, "learning_rate": 9.14652416652338e-07, "loss": 2.0627, "num_input_tokens_seen": 146422056, "step": 143800 }, { "epoch": 2.8331791065346223, "grad_norm": 1.8405635356903076, "learning_rate": 9.145371684751206e-07, "loss": 1.9678, "num_input_tokens_seen": 146523976, "step": 143900 }, { "epoch": 2.835147959284125, "grad_norm": 1.8517303466796875, "learning_rate": 9.144218498093541e-07, "loss": 1.9528, "num_input_tokens_seen": 146626376, "step": 144000 }, { "epoch": 2.837116812033628, "grad_norm": 2.000028610229492, "learning_rate": 9.143064606746476e-07, "loss": 1.9807, "num_input_tokens_seen": 146728776, "step": 144100 }, { "epoch": 2.8390856647831306, "grad_norm": 1.7356005907058716, "learning_rate": 9.141910010906218e-07, "loss": 2.0446, "num_input_tokens_seen": 146830576, "step": 144200 }, { "epoch": 2.841054517532634, "grad_norm": 1.9533261060714722, "learning_rate": 9.140754710769097e-07, "loss": 1.9862, "num_input_tokens_seen": 146932976, "step": 144300 }, { "epoch": 2.8430233702821366, "grad_norm": 1.9368846416473389, "learning_rate": 9.139598706531562e-07, "loss": 1.9735, "num_input_tokens_seen": 147034608, "step": 144400 }, { "epoch": 2.8449922230316393, "grad_norm": 2.0260725021362305, "learning_rate": 9.138441998390183e-07, "loss": 2.0029, "num_input_tokens_seen": 147135624, "step": 144500 }, { "epoch": 2.8469610757811425, "grad_norm": 4.3465352058410645, "learning_rate": 9.137284586541646e-07, "loss": 1.9741, "num_input_tokens_seen": 147238024, "step": 144600 }, { "epoch": 2.8489299285306453, "grad_norm": 1.8397555351257324, "learning_rate": 9.136126471182761e-07, "loss": 1.9228, "num_input_tokens_seen": 147338480, "step": 144700 }, { "epoch": 2.850898781280148, "grad_norm": 2.2060608863830566, "learning_rate": 9.134967652510456e-07, "loss": 1.9574, "num_input_tokens_seen": 147440360, "step": 144800 }, { "epoch": 2.852867634029651, "grad_norm": 1.9449845552444458, "learning_rate": 9.133808130721777e-07, "loss": 2.0137, "num_input_tokens_seen": 147542072, "step": 144900 }, { "epoch": 2.8548364867791536, "grad_norm": 1.9381047487258911, "learning_rate": 9.13264790601389e-07, "loss": 1.952, "num_input_tokens_seen": 147644472, "step": 145000 }, { "epoch": 2.856805339528657, "grad_norm": 1.975183129310608, "learning_rate": 9.131486978584085e-07, "loss": 2.0197, "num_input_tokens_seen": 147746872, "step": 145100 }, { "epoch": 2.8587741922781595, "grad_norm": 2.124565839767456, "learning_rate": 9.130325348629766e-07, "loss": 1.9374, "num_input_tokens_seen": 147849272, "step": 145200 }, { "epoch": 2.8607430450276623, "grad_norm": 2.788569927215576, "learning_rate": 9.129163016348458e-07, "loss": 1.9459, "num_input_tokens_seen": 147951672, "step": 145300 }, { "epoch": 2.8627118977771655, "grad_norm": 1.9773931503295898, "learning_rate": 9.127999981937806e-07, "loss": 1.9606, "num_input_tokens_seen": 148054072, "step": 145400 }, { "epoch": 2.8646807505266683, "grad_norm": 1.9272105693817139, "learning_rate": 9.126836245595572e-07, "loss": 2.0163, "num_input_tokens_seen": 148156472, "step": 145500 }, { "epoch": 2.866649603276171, "grad_norm": 1.7956663370132446, "learning_rate": 9.125671807519644e-07, "loss": 1.9884, "num_input_tokens_seen": 148257416, "step": 145600 }, { "epoch": 2.868618456025674, "grad_norm": 1.8678843975067139, "learning_rate": 9.124506667908023e-07, "loss": 2.0125, "num_input_tokens_seen": 148359816, "step": 145700 }, { "epoch": 2.8705873087751765, "grad_norm": 1.9273408651351929, "learning_rate": 9.12334082695883e-07, "loss": 1.9562, "num_input_tokens_seen": 148462216, "step": 145800 }, { "epoch": 2.8725561615246793, "grad_norm": 2.330432653427124, "learning_rate": 9.122174284870307e-07, "loss": 1.9803, "num_input_tokens_seen": 148563888, "step": 145900 }, { "epoch": 2.8745250142741825, "grad_norm": 1.7445673942565918, "learning_rate": 9.121007041840816e-07, "loss": 1.9816, "num_input_tokens_seen": 148665624, "step": 146000 }, { "epoch": 2.8764938670236853, "grad_norm": 2.0610992908477783, "learning_rate": 9.119839098068835e-07, "loss": 2.0486, "num_input_tokens_seen": 148766752, "step": 146100 }, { "epoch": 2.878462719773188, "grad_norm": 1.9714587926864624, "learning_rate": 9.118670453752964e-07, "loss": 1.9396, "num_input_tokens_seen": 148868736, "step": 146200 }, { "epoch": 2.880431572522691, "grad_norm": 1.8859783411026, "learning_rate": 9.11750110909192e-07, "loss": 1.9396, "num_input_tokens_seen": 148971136, "step": 146300 }, { "epoch": 2.882400425272194, "grad_norm": 2.111337423324585, "learning_rate": 9.116331064284542e-07, "loss": 2.0412, "num_input_tokens_seen": 149071728, "step": 146400 }, { "epoch": 2.8843692780216967, "grad_norm": 2.2499639987945557, "learning_rate": 9.115160319529783e-07, "loss": 2.0356, "num_input_tokens_seen": 149173560, "step": 146500 }, { "epoch": 2.8863381307711995, "grad_norm": 1.7903567552566528, "learning_rate": 9.113988875026724e-07, "loss": 2.0333, "num_input_tokens_seen": 149275328, "step": 146600 }, { "epoch": 2.8883069835207023, "grad_norm": 1.816192388534546, "learning_rate": 9.112816730974552e-07, "loss": 1.964, "num_input_tokens_seen": 149376408, "step": 146700 }, { "epoch": 2.8902758362702055, "grad_norm": 2.165644884109497, "learning_rate": 9.111643887572586e-07, "loss": 1.9503, "num_input_tokens_seen": 149478104, "step": 146800 }, { "epoch": 2.892244689019708, "grad_norm": 1.7579931020736694, "learning_rate": 9.110470345020257e-07, "loss": 1.9369, "num_input_tokens_seen": 149579984, "step": 146900 }, { "epoch": 2.894213541769211, "grad_norm": 2.125364303588867, "learning_rate": 9.109296103517115e-07, "loss": 2.013, "num_input_tokens_seen": 149682384, "step": 147000 }, { "epoch": 2.896182394518714, "grad_norm": 1.868359923362732, "learning_rate": 9.10812116326283e-07, "loss": 1.9916, "num_input_tokens_seen": 149784784, "step": 147100 }, { "epoch": 2.898151247268217, "grad_norm": 2.1574387550354004, "learning_rate": 9.10694552445719e-07, "loss": 1.9398, "num_input_tokens_seen": 149887184, "step": 147200 }, { "epoch": 2.9001201000177197, "grad_norm": 1.8687632083892822, "learning_rate": 9.105769187300102e-07, "loss": 2.0205, "num_input_tokens_seen": 149989584, "step": 147300 }, { "epoch": 2.9020889527672225, "grad_norm": 1.9556429386138916, "learning_rate": 9.104592151991594e-07, "loss": 2.0266, "num_input_tokens_seen": 150090624, "step": 147400 }, { "epoch": 2.904057805516725, "grad_norm": 1.843937873840332, "learning_rate": 9.103414418731811e-07, "loss": 1.9608, "num_input_tokens_seen": 150193024, "step": 147500 }, { "epoch": 2.9060266582662284, "grad_norm": 1.852134346961975, "learning_rate": 9.102235987721016e-07, "loss": 1.9864, "num_input_tokens_seen": 150294872, "step": 147600 }, { "epoch": 2.907995511015731, "grad_norm": 1.8409780263900757, "learning_rate": 9.10105685915959e-07, "loss": 1.9823, "num_input_tokens_seen": 150396128, "step": 147700 }, { "epoch": 2.909964363765234, "grad_norm": 4.855228424072266, "learning_rate": 9.099877033248036e-07, "loss": 1.9702, "num_input_tokens_seen": 150497856, "step": 147800 }, { "epoch": 2.911933216514737, "grad_norm": 1.8119330406188965, "learning_rate": 9.098696510186973e-07, "loss": 1.9961, "num_input_tokens_seen": 150599704, "step": 147900 }, { "epoch": 2.91390206926424, "grad_norm": 2.1123673915863037, "learning_rate": 9.097515290177138e-07, "loss": 1.9562, "num_input_tokens_seen": 150701416, "step": 148000 }, { "epoch": 2.9158709220137426, "grad_norm": 2.0797832012176514, "learning_rate": 9.096333373419387e-07, "loss": 1.9961, "num_input_tokens_seen": 150803816, "step": 148100 }, { "epoch": 2.9178397747632454, "grad_norm": 1.924194097518921, "learning_rate": 9.095150760114696e-07, "loss": 2.0, "num_input_tokens_seen": 150906216, "step": 148200 }, { "epoch": 2.919808627512748, "grad_norm": 2.2523162364959717, "learning_rate": 9.093967450464159e-07, "loss": 1.9633, "num_input_tokens_seen": 151007952, "step": 148300 }, { "epoch": 2.921777480262251, "grad_norm": 2.0440046787261963, "learning_rate": 9.092783444668988e-07, "loss": 1.9963, "num_input_tokens_seen": 151110080, "step": 148400 }, { "epoch": 2.923746333011754, "grad_norm": 1.921358585357666, "learning_rate": 9.091598742930511e-07, "loss": 1.9978, "num_input_tokens_seen": 151212480, "step": 148500 }, { "epoch": 2.925715185761257, "grad_norm": 2.0041520595550537, "learning_rate": 9.090413345450179e-07, "loss": 1.9963, "num_input_tokens_seen": 151314568, "step": 148600 }, { "epoch": 2.9276840385107596, "grad_norm": 2.030916213989258, "learning_rate": 9.089227252429557e-07, "loss": 2.0069, "num_input_tokens_seen": 151416344, "step": 148700 }, { "epoch": 2.929652891260263, "grad_norm": 1.925169587135315, "learning_rate": 9.08804046407033e-07, "loss": 2.0032, "num_input_tokens_seen": 151518016, "step": 148800 }, { "epoch": 2.9316217440097656, "grad_norm": 1.727295994758606, "learning_rate": 9.086852980574303e-07, "loss": 1.9748, "num_input_tokens_seen": 151619704, "step": 148900 }, { "epoch": 2.9335905967592684, "grad_norm": 2.8354527950286865, "learning_rate": 9.085664802143397e-07, "loss": 1.9605, "num_input_tokens_seen": 151722104, "step": 149000 }, { "epoch": 2.935559449508771, "grad_norm": 1.894550085067749, "learning_rate": 9.084475928979652e-07, "loss": 1.9814, "num_input_tokens_seen": 151823880, "step": 149100 }, { "epoch": 2.937528302258274, "grad_norm": 1.8652126789093018, "learning_rate": 9.083286361285225e-07, "loss": 1.9943, "num_input_tokens_seen": 151925528, "step": 149200 }, { "epoch": 2.939497155007777, "grad_norm": 3.667811393737793, "learning_rate": 9.082096099262391e-07, "loss": 1.952, "num_input_tokens_seen": 152027928, "step": 149300 }, { "epoch": 2.94146600775728, "grad_norm": 1.7258117198944092, "learning_rate": 9.080905143113546e-07, "loss": 1.9856, "num_input_tokens_seen": 152128712, "step": 149400 }, { "epoch": 2.9434348605067826, "grad_norm": 2.0203278064727783, "learning_rate": 9.079713493041203e-07, "loss": 1.9416, "num_input_tokens_seen": 152231112, "step": 149500 }, { "epoch": 2.945403713256286, "grad_norm": 1.8034933805465698, "learning_rate": 9.078521149247989e-07, "loss": 1.925, "num_input_tokens_seen": 152332144, "step": 149600 }, { "epoch": 2.9473725660057886, "grad_norm": 2.9003779888153076, "learning_rate": 9.077328111936653e-07, "loss": 1.9684, "num_input_tokens_seen": 152433712, "step": 149700 }, { "epoch": 2.9493414187552913, "grad_norm": 1.8845500946044922, "learning_rate": 9.076134381310061e-07, "loss": 1.9728, "num_input_tokens_seen": 152536112, "step": 149800 }, { "epoch": 2.951310271504794, "grad_norm": 1.877603530883789, "learning_rate": 9.074939957571199e-07, "loss": 1.963, "num_input_tokens_seen": 152637648, "step": 149900 }, { "epoch": 2.953279124254297, "grad_norm": 2.0671799182891846, "learning_rate": 9.073744840923165e-07, "loss": 1.9787, "num_input_tokens_seen": 152740048, "step": 150000 }, { "epoch": 2.9552479770038, "grad_norm": 1.756220817565918, "learning_rate": 9.072549031569181e-07, "loss": 1.9894, "num_input_tokens_seen": 152841808, "step": 150100 }, { "epoch": 2.957216829753303, "grad_norm": 1.8161519765853882, "learning_rate": 9.071352529712584e-07, "loss": 1.9597, "num_input_tokens_seen": 152944208, "step": 150200 }, { "epoch": 2.9591856825028056, "grad_norm": 1.9361835718154907, "learning_rate": 9.070155335556828e-07, "loss": 1.9777, "num_input_tokens_seen": 153045376, "step": 150300 }, { "epoch": 2.9611545352523083, "grad_norm": 1.94983971118927, "learning_rate": 9.068957449305488e-07, "loss": 1.9811, "num_input_tokens_seen": 153147776, "step": 150400 }, { "epoch": 2.9631233880018115, "grad_norm": 2.0812742710113525, "learning_rate": 9.067758871162252e-07, "loss": 2.0175, "num_input_tokens_seen": 153248736, "step": 150500 }, { "epoch": 2.9650922407513143, "grad_norm": 1.9933220148086548, "learning_rate": 9.066559601330929e-07, "loss": 1.9907, "num_input_tokens_seen": 153351136, "step": 150600 }, { "epoch": 2.967061093500817, "grad_norm": 1.9672973155975342, "learning_rate": 9.065359640015444e-07, "loss": 2.0143, "num_input_tokens_seen": 153453464, "step": 150700 }, { "epoch": 2.96902994625032, "grad_norm": 1.8607351779937744, "learning_rate": 9.064158987419842e-07, "loss": 1.9579, "num_input_tokens_seen": 153555360, "step": 150800 }, { "epoch": 2.9709987989998226, "grad_norm": 2.0569875240325928, "learning_rate": 9.062957643748284e-07, "loss": 2.006, "num_input_tokens_seen": 153657760, "step": 150900 }, { "epoch": 2.9729676517493258, "grad_norm": 1.9158655405044556, "learning_rate": 9.061755609205044e-07, "loss": 1.999, "num_input_tokens_seen": 153757472, "step": 151000 }, { "epoch": 2.9749365044988285, "grad_norm": 1.9854087829589844, "learning_rate": 9.060552883994524e-07, "loss": 1.9399, "num_input_tokens_seen": 153858400, "step": 151100 }, { "epoch": 2.9769053572483313, "grad_norm": 2.4062297344207764, "learning_rate": 9.059349468321234e-07, "loss": 1.9781, "num_input_tokens_seen": 153960240, "step": 151200 }, { "epoch": 2.9788742099978345, "grad_norm": 1.714268445968628, "learning_rate": 9.058145362389804e-07, "loss": 1.9746, "num_input_tokens_seen": 154061192, "step": 151300 }, { "epoch": 2.9808430627473372, "grad_norm": 1.889648199081421, "learning_rate": 9.056940566404983e-07, "loss": 1.9691, "num_input_tokens_seen": 154162960, "step": 151400 }, { "epoch": 2.98281191549684, "grad_norm": 1.926221251487732, "learning_rate": 9.055735080571636e-07, "loss": 2.0362, "num_input_tokens_seen": 154263232, "step": 151500 }, { "epoch": 2.9847807682463428, "grad_norm": 1.9325400590896606, "learning_rate": 9.054528905094747e-07, "loss": 1.9895, "num_input_tokens_seen": 154363224, "step": 151600 }, { "epoch": 2.9867496209958455, "grad_norm": 1.9317102432250977, "learning_rate": 9.053322040179416e-07, "loss": 1.9955, "num_input_tokens_seen": 154464776, "step": 151700 }, { "epoch": 2.9887184737453487, "grad_norm": 2.054033041000366, "learning_rate": 9.052114486030859e-07, "loss": 1.966, "num_input_tokens_seen": 154566408, "step": 151800 }, { "epoch": 2.9906873264948515, "grad_norm": 1.9863126277923584, "learning_rate": 9.05090624285441e-07, "loss": 1.9901, "num_input_tokens_seen": 154667800, "step": 151900 }, { "epoch": 2.9926561792443542, "grad_norm": 1.8175255060195923, "learning_rate": 9.049697310855521e-07, "loss": 1.9483, "num_input_tokens_seen": 154770200, "step": 152000 }, { "epoch": 2.9946250319938574, "grad_norm": 2.620454788208008, "learning_rate": 9.048487690239763e-07, "loss": 1.9753, "num_input_tokens_seen": 154872600, "step": 152100 }, { "epoch": 2.99659388474336, "grad_norm": 2.014535665512085, "learning_rate": 9.047277381212819e-07, "loss": 2.0067, "num_input_tokens_seen": 154974128, "step": 152200 }, { "epoch": 2.998562737492863, "grad_norm": 1.8836442232131958, "learning_rate": 9.046066383980491e-07, "loss": 1.9883, "num_input_tokens_seen": 155074968, "step": 152300 }, { "epoch": 3.0005315902423657, "grad_norm": 1.7077946662902832, "learning_rate": 9.044854698748703e-07, "loss": 1.9731, "num_input_tokens_seen": 155175984, "step": 152400 }, { "epoch": 3.0025004429918685, "grad_norm": 1.916077733039856, "learning_rate": 9.043642325723488e-07, "loss": 1.9718, "num_input_tokens_seen": 155277776, "step": 152500 }, { "epoch": 3.0044692957413717, "grad_norm": 1.922825813293457, "learning_rate": 9.042429265111e-07, "loss": 1.9881, "num_input_tokens_seen": 155380176, "step": 152600 }, { "epoch": 3.0064381484908744, "grad_norm": 1.6958569288253784, "learning_rate": 9.041215517117513e-07, "loss": 1.9728, "num_input_tokens_seen": 155482576, "step": 152700 }, { "epoch": 3.008407001240377, "grad_norm": 2.1971304416656494, "learning_rate": 9.040001081949412e-07, "loss": 1.9748, "num_input_tokens_seen": 155584976, "step": 152800 }, { "epoch": 3.01037585398988, "grad_norm": 1.9326709508895874, "learning_rate": 9.038785959813201e-07, "loss": 1.9952, "num_input_tokens_seen": 155687376, "step": 152900 }, { "epoch": 3.012344706739383, "grad_norm": 1.6472011804580688, "learning_rate": 9.037570150915502e-07, "loss": 1.9616, "num_input_tokens_seen": 155789776, "step": 153000 }, { "epoch": 3.014313559488886, "grad_norm": 2.083277940750122, "learning_rate": 9.036353655463054e-07, "loss": 2.0036, "num_input_tokens_seen": 155892176, "step": 153100 }, { "epoch": 3.0162824122383887, "grad_norm": 1.840825080871582, "learning_rate": 9.035136473662712e-07, "loss": 1.9263, "num_input_tokens_seen": 155994096, "step": 153200 }, { "epoch": 3.0182512649878914, "grad_norm": 2.437803030014038, "learning_rate": 9.033918605721445e-07, "loss": 1.9592, "num_input_tokens_seen": 156096496, "step": 153300 }, { "epoch": 3.0202201177373946, "grad_norm": 1.7386488914489746, "learning_rate": 9.032700051846345e-07, "loss": 1.9663, "num_input_tokens_seen": 156198120, "step": 153400 }, { "epoch": 3.0221889704868974, "grad_norm": 2.4257898330688477, "learning_rate": 9.031480812244612e-07, "loss": 2.0058, "num_input_tokens_seen": 156298320, "step": 153500 }, { "epoch": 3.0241578232364, "grad_norm": 1.9683313369750977, "learning_rate": 9.030260887123571e-07, "loss": 1.943, "num_input_tokens_seen": 156399448, "step": 153600 }, { "epoch": 3.026126675985903, "grad_norm": 2.007336139678955, "learning_rate": 9.029040276690658e-07, "loss": 2.0275, "num_input_tokens_seen": 156500760, "step": 153700 }, { "epoch": 3.0280955287354057, "grad_norm": 1.9971493482589722, "learning_rate": 9.02781898115343e-07, "loss": 1.95, "num_input_tokens_seen": 156603160, "step": 153800 }, { "epoch": 3.030064381484909, "grad_norm": 1.8072985410690308, "learning_rate": 9.026597000719555e-07, "loss": 1.9866, "num_input_tokens_seen": 156705560, "step": 153900 }, { "epoch": 3.0320332342344116, "grad_norm": 2.3550922870635986, "learning_rate": 9.025374335596824e-07, "loss": 1.9878, "num_input_tokens_seen": 156807512, "step": 154000 }, { "epoch": 3.0340020869839144, "grad_norm": 1.8513967990875244, "learning_rate": 9.024150985993138e-07, "loss": 2.004, "num_input_tokens_seen": 156907672, "step": 154100 }, { "epoch": 3.035970939733417, "grad_norm": 1.9500596523284912, "learning_rate": 9.022926952116516e-07, "loss": 1.9728, "num_input_tokens_seen": 157010072, "step": 154200 }, { "epoch": 3.0379397924829203, "grad_norm": 1.7841788530349731, "learning_rate": 9.0217022341751e-07, "loss": 1.9871, "num_input_tokens_seen": 157112472, "step": 154300 }, { "epoch": 3.039908645232423, "grad_norm": 1.9122768640518188, "learning_rate": 9.020476832377137e-07, "loss": 1.9695, "num_input_tokens_seen": 157214872, "step": 154400 }, { "epoch": 3.041877497981926, "grad_norm": 1.908725619316101, "learning_rate": 9.019250746931e-07, "loss": 1.9509, "num_input_tokens_seen": 157316408, "step": 154500 }, { "epoch": 3.0438463507314286, "grad_norm": 2.1317882537841797, "learning_rate": 9.018023978045173e-07, "loss": 1.9725, "num_input_tokens_seen": 157418208, "step": 154600 }, { "epoch": 3.045815203480932, "grad_norm": 2.0081725120544434, "learning_rate": 9.016796525928258e-07, "loss": 1.989, "num_input_tokens_seen": 157519448, "step": 154700 }, { "epoch": 3.0477840562304346, "grad_norm": 1.9159324169158936, "learning_rate": 9.015568390788974e-07, "loss": 2.0155, "num_input_tokens_seen": 157620976, "step": 154800 }, { "epoch": 3.0497529089799373, "grad_norm": 3.625356435775757, "learning_rate": 9.014339572836153e-07, "loss": 2.0194, "num_input_tokens_seen": 157722760, "step": 154900 }, { "epoch": 3.05172176172944, "grad_norm": 2.53078293800354, "learning_rate": 9.013110072278747e-07, "loss": 1.9585, "num_input_tokens_seen": 157824016, "step": 155000 }, { "epoch": 3.0536906144789433, "grad_norm": 2.018456220626831, "learning_rate": 9.011879889325821e-07, "loss": 1.9959, "num_input_tokens_seen": 157925568, "step": 155100 }, { "epoch": 3.055659467228446, "grad_norm": 1.8994927406311035, "learning_rate": 9.010649024186558e-07, "loss": 2.0125, "num_input_tokens_seen": 158027968, "step": 155200 }, { "epoch": 3.057628319977949, "grad_norm": 1.9639719724655151, "learning_rate": 9.009417477070254e-07, "loss": 1.9846, "num_input_tokens_seen": 158129152, "step": 155300 }, { "epoch": 3.0595971727274516, "grad_norm": 2.702502727508545, "learning_rate": 9.008185248186328e-07, "loss": 1.9872, "num_input_tokens_seen": 158231552, "step": 155400 }, { "epoch": 3.0615660254769548, "grad_norm": 2.3195114135742188, "learning_rate": 9.006952337744304e-07, "loss": 1.9969, "num_input_tokens_seen": 158333304, "step": 155500 }, { "epoch": 3.0635348782264575, "grad_norm": 1.6101160049438477, "learning_rate": 9.005718745953833e-07, "loss": 1.9884, "num_input_tokens_seen": 158434848, "step": 155600 }, { "epoch": 3.0655037309759603, "grad_norm": 1.9197417497634888, "learning_rate": 9.004484473024675e-07, "loss": 1.9332, "num_input_tokens_seen": 158537248, "step": 155700 }, { "epoch": 3.067472583725463, "grad_norm": 1.8220990896224976, "learning_rate": 9.003249519166707e-07, "loss": 1.9378, "num_input_tokens_seen": 158639648, "step": 155800 }, { "epoch": 3.069441436474966, "grad_norm": 2.0467464923858643, "learning_rate": 9.002013884589924e-07, "loss": 2.0053, "num_input_tokens_seen": 158741496, "step": 155900 }, { "epoch": 3.071410289224469, "grad_norm": 1.7307876348495483, "learning_rate": 9.000777569504435e-07, "loss": 1.9448, "num_input_tokens_seen": 158843424, "step": 156000 }, { "epoch": 3.0733791419739718, "grad_norm": 1.9886528253555298, "learning_rate": 8.999540574120462e-07, "loss": 2.0004, "num_input_tokens_seen": 158945048, "step": 156100 }, { "epoch": 3.0753479947234745, "grad_norm": 1.8458962440490723, "learning_rate": 8.99830289864835e-07, "loss": 2.0059, "num_input_tokens_seen": 159046584, "step": 156200 }, { "epoch": 3.0773168474729773, "grad_norm": 2.0900425910949707, "learning_rate": 8.997064543298553e-07, "loss": 1.9767, "num_input_tokens_seen": 159148504, "step": 156300 }, { "epoch": 3.0792857002224805, "grad_norm": 2.185340404510498, "learning_rate": 8.995825508281643e-07, "loss": 1.9557, "num_input_tokens_seen": 159249696, "step": 156400 }, { "epoch": 3.0812545529719833, "grad_norm": 1.8409805297851562, "learning_rate": 8.994585793808308e-07, "loss": 1.9945, "num_input_tokens_seen": 159352096, "step": 156500 }, { "epoch": 3.083223405721486, "grad_norm": 1.7478899955749512, "learning_rate": 8.993345400089351e-07, "loss": 1.9673, "num_input_tokens_seen": 159454496, "step": 156600 }, { "epoch": 3.0851922584709888, "grad_norm": 1.7332470417022705, "learning_rate": 8.992104327335689e-07, "loss": 1.9692, "num_input_tokens_seen": 159556896, "step": 156700 }, { "epoch": 3.087161111220492, "grad_norm": 1.8158246278762817, "learning_rate": 8.990862575758358e-07, "loss": 2.0218, "num_input_tokens_seen": 159659296, "step": 156800 }, { "epoch": 3.0891299639699947, "grad_norm": 1.8258429765701294, "learning_rate": 8.989620145568506e-07, "loss": 1.9707, "num_input_tokens_seen": 159761696, "step": 156900 }, { "epoch": 3.0910988167194975, "grad_norm": 1.9211344718933105, "learning_rate": 8.988377036977398e-07, "loss": 2.0424, "num_input_tokens_seen": 159864096, "step": 157000 }, { "epoch": 3.0930676694690002, "grad_norm": 2.6851234436035156, "learning_rate": 8.987133250196414e-07, "loss": 1.9635, "num_input_tokens_seen": 159966496, "step": 157100 }, { "epoch": 3.0950365222185034, "grad_norm": 1.910126805305481, "learning_rate": 8.985888785437052e-07, "loss": 1.9844, "num_input_tokens_seen": 160067936, "step": 157200 }, { "epoch": 3.097005374968006, "grad_norm": 1.9952406883239746, "learning_rate": 8.984643642910918e-07, "loss": 1.9908, "num_input_tokens_seen": 160170336, "step": 157300 }, { "epoch": 3.098974227717509, "grad_norm": 1.881360411643982, "learning_rate": 8.983397822829744e-07, "loss": 1.9762, "num_input_tokens_seen": 160272736, "step": 157400 }, { "epoch": 3.1009430804670117, "grad_norm": 1.9617085456848145, "learning_rate": 8.982151325405365e-07, "loss": 1.984, "num_input_tokens_seen": 160375136, "step": 157500 }, { "epoch": 3.102911933216515, "grad_norm": 1.795054316520691, "learning_rate": 8.98090415084974e-07, "loss": 1.937, "num_input_tokens_seen": 160476704, "step": 157600 }, { "epoch": 3.1048807859660177, "grad_norm": 1.9739768505096436, "learning_rate": 8.979656299374941e-07, "loss": 2.0462, "num_input_tokens_seen": 160579104, "step": 157700 }, { "epoch": 3.1068496387155204, "grad_norm": 1.8118436336517334, "learning_rate": 8.978407771193155e-07, "loss": 1.9849, "num_input_tokens_seen": 160680800, "step": 157800 }, { "epoch": 3.108818491465023, "grad_norm": 2.1058084964752197, "learning_rate": 8.977158566516683e-07, "loss": 2.0085, "num_input_tokens_seen": 160782568, "step": 157900 }, { "epoch": 3.1107873442145264, "grad_norm": 1.942762017250061, "learning_rate": 8.97590868555794e-07, "loss": 2.0127, "num_input_tokens_seen": 160884968, "step": 158000 }, { "epoch": 3.112756196964029, "grad_norm": 2.611391305923462, "learning_rate": 8.974658128529461e-07, "loss": 1.9945, "num_input_tokens_seen": 160987368, "step": 158100 }, { "epoch": 3.114725049713532, "grad_norm": 1.9599195718765259, "learning_rate": 8.973406895643891e-07, "loss": 1.9623, "num_input_tokens_seen": 161089768, "step": 158200 }, { "epoch": 3.1166939024630347, "grad_norm": 1.860360026359558, "learning_rate": 8.972154987113991e-07, "loss": 1.9532, "num_input_tokens_seen": 161191304, "step": 158300 }, { "epoch": 3.1186627552125374, "grad_norm": 1.8410513401031494, "learning_rate": 8.970902403152638e-07, "loss": 1.9718, "num_input_tokens_seen": 161292304, "step": 158400 }, { "epoch": 3.1206316079620406, "grad_norm": 2.022204637527466, "learning_rate": 8.969649143972824e-07, "loss": 2.0167, "num_input_tokens_seen": 161394160, "step": 158500 }, { "epoch": 3.1226004607115434, "grad_norm": 1.9852044582366943, "learning_rate": 8.968395209787656e-07, "loss": 2.0541, "num_input_tokens_seen": 161496560, "step": 158600 }, { "epoch": 3.124569313461046, "grad_norm": 41.12730407714844, "learning_rate": 8.967140600810351e-07, "loss": 2.0817, "num_input_tokens_seen": 161598008, "step": 158700 }, { "epoch": 3.126538166210549, "grad_norm": 2.2703306674957275, "learning_rate": 8.965885317254249e-07, "loss": 2.0078, "num_input_tokens_seen": 161699800, "step": 158800 }, { "epoch": 3.128507018960052, "grad_norm": 2.369110345840454, "learning_rate": 8.964629359332797e-07, "loss": 1.9669, "num_input_tokens_seen": 161801704, "step": 158900 }, { "epoch": 3.130475871709555, "grad_norm": 2.0651791095733643, "learning_rate": 8.963372727259564e-07, "loss": 1.9613, "num_input_tokens_seen": 161903336, "step": 159000 }, { "epoch": 3.1324447244590576, "grad_norm": 1.8558191061019897, "learning_rate": 8.962115421248225e-07, "loss": 1.9656, "num_input_tokens_seen": 162005736, "step": 159100 }, { "epoch": 3.1344135772085604, "grad_norm": 2.227570056915283, "learning_rate": 8.960857441512578e-07, "loss": 1.9927, "num_input_tokens_seen": 162106912, "step": 159200 }, { "epoch": 3.1363824299580636, "grad_norm": 1.9104275703430176, "learning_rate": 8.95959878826653e-07, "loss": 2.0015, "num_input_tokens_seen": 162208520, "step": 159300 }, { "epoch": 3.1383512827075664, "grad_norm": 2.1558897495269775, "learning_rate": 8.958339461724104e-07, "loss": 1.9662, "num_input_tokens_seen": 162310920, "step": 159400 }, { "epoch": 3.140320135457069, "grad_norm": 1.891343593597412, "learning_rate": 8.957079462099442e-07, "loss": 1.9294, "num_input_tokens_seen": 162412504, "step": 159500 }, { "epoch": 3.142288988206572, "grad_norm": 1.9379512071609497, "learning_rate": 8.95581878960679e-07, "loss": 1.9676, "num_input_tokens_seen": 162514304, "step": 159600 }, { "epoch": 3.144257840956075, "grad_norm": 2.2635998725891113, "learning_rate": 8.954557444460518e-07, "loss": 2.025, "num_input_tokens_seen": 162615960, "step": 159700 }, { "epoch": 3.146226693705578, "grad_norm": 1.8451220989227295, "learning_rate": 8.953295426875107e-07, "loss": 1.9757, "num_input_tokens_seen": 162717824, "step": 159800 }, { "epoch": 3.1481955464550806, "grad_norm": 1.8659178018569946, "learning_rate": 8.952032737065153e-07, "loss": 1.9565, "num_input_tokens_seen": 162819704, "step": 159900 }, { "epoch": 3.1501643992045834, "grad_norm": 1.7440221309661865, "learning_rate": 8.950769375245364e-07, "loss": 1.9858, "num_input_tokens_seen": 162922104, "step": 160000 }, { "epoch": 3.1521332519540866, "grad_norm": 2.0307304859161377, "learning_rate": 8.949505341630566e-07, "loss": 2.0013, "num_input_tokens_seen": 163022320, "step": 160100 }, { "epoch": 3.1541021047035893, "grad_norm": 1.9746315479278564, "learning_rate": 8.948240636435695e-07, "loss": 1.9869, "num_input_tokens_seen": 163123584, "step": 160200 }, { "epoch": 3.156070957453092, "grad_norm": 1.9546113014221191, "learning_rate": 8.946975259875806e-07, "loss": 1.9716, "num_input_tokens_seen": 163225344, "step": 160300 }, { "epoch": 3.158039810202595, "grad_norm": 2.3259716033935547, "learning_rate": 8.945709212166066e-07, "loss": 1.9689, "num_input_tokens_seen": 163327008, "step": 160400 }, { "epoch": 3.160008662952098, "grad_norm": 1.8341962099075317, "learning_rate": 8.944442493521751e-07, "loss": 1.9864, "num_input_tokens_seen": 163429408, "step": 160500 }, { "epoch": 3.161977515701601, "grad_norm": 1.8844000101089478, "learning_rate": 8.94317510415826e-07, "loss": 2.0079, "num_input_tokens_seen": 163531808, "step": 160600 }, { "epoch": 3.1639463684511036, "grad_norm": 2.1464009284973145, "learning_rate": 8.941907044291102e-07, "loss": 2.0031, "num_input_tokens_seen": 163634208, "step": 160700 }, { "epoch": 3.1659152212006063, "grad_norm": 1.9537999629974365, "learning_rate": 8.940638314135898e-07, "loss": 2.0465, "num_input_tokens_seen": 163735848, "step": 160800 }, { "epoch": 3.167884073950109, "grad_norm": 1.8776942491531372, "learning_rate": 8.939368913908385e-07, "loss": 2.007, "num_input_tokens_seen": 163837680, "step": 160900 }, { "epoch": 3.1698529266996123, "grad_norm": 1.7506014108657837, "learning_rate": 8.938098843824415e-07, "loss": 1.9985, "num_input_tokens_seen": 163939272, "step": 161000 }, { "epoch": 3.171821779449115, "grad_norm": 2.0206222534179688, "learning_rate": 8.936828104099952e-07, "loss": 1.9931, "num_input_tokens_seen": 164040760, "step": 161100 }, { "epoch": 3.173790632198618, "grad_norm": 1.89096999168396, "learning_rate": 8.935556694951074e-07, "loss": 2.0344, "num_input_tokens_seen": 164142384, "step": 161200 }, { "epoch": 3.1757594849481205, "grad_norm": 1.959899663925171, "learning_rate": 8.934284616593975e-07, "loss": 1.9667, "num_input_tokens_seen": 164244784, "step": 161300 }, { "epoch": 3.1777283376976238, "grad_norm": 1.9966684579849243, "learning_rate": 8.933011869244959e-07, "loss": 1.9644, "num_input_tokens_seen": 164346480, "step": 161400 }, { "epoch": 3.1796971904471265, "grad_norm": 2.167038917541504, "learning_rate": 8.931738453120448e-07, "loss": 1.9888, "num_input_tokens_seen": 164448264, "step": 161500 }, { "epoch": 3.1816660431966293, "grad_norm": 1.9101742506027222, "learning_rate": 8.930464368436974e-07, "loss": 2.0127, "num_input_tokens_seen": 164550664, "step": 161600 }, { "epoch": 3.183634895946132, "grad_norm": 1.678812026977539, "learning_rate": 8.929189615411185e-07, "loss": 1.9547, "num_input_tokens_seen": 164652440, "step": 161700 }, { "epoch": 3.1856037486956352, "grad_norm": 1.9156702756881714, "learning_rate": 8.927914194259845e-07, "loss": 1.9994, "num_input_tokens_seen": 164754008, "step": 161800 }, { "epoch": 3.187572601445138, "grad_norm": 2.3595893383026123, "learning_rate": 8.926638105199824e-07, "loss": 1.9736, "num_input_tokens_seen": 164854520, "step": 161900 }, { "epoch": 3.1895414541946407, "grad_norm": 2.0386016368865967, "learning_rate": 8.925361348448112e-07, "loss": 1.9764, "num_input_tokens_seen": 164956552, "step": 162000 }, { "epoch": 3.1915103069441435, "grad_norm": 2.2614169120788574, "learning_rate": 8.92408392422181e-07, "loss": 1.9695, "num_input_tokens_seen": 165058504, "step": 162100 }, { "epoch": 3.1934791596936467, "grad_norm": 1.8114501237869263, "learning_rate": 8.922805832738133e-07, "loss": 1.9296, "num_input_tokens_seen": 165160904, "step": 162200 }, { "epoch": 3.1954480124431495, "grad_norm": 2.0567026138305664, "learning_rate": 8.921527074214414e-07, "loss": 1.9708, "num_input_tokens_seen": 165262840, "step": 162300 }, { "epoch": 3.1974168651926522, "grad_norm": 1.8833788633346558, "learning_rate": 8.92024764886809e-07, "loss": 1.9825, "num_input_tokens_seen": 165364128, "step": 162400 }, { "epoch": 3.199385717942155, "grad_norm": 2.330120801925659, "learning_rate": 8.918967556916719e-07, "loss": 1.9494, "num_input_tokens_seen": 165465896, "step": 162500 }, { "epoch": 3.2013545706916577, "grad_norm": 2.012997627258301, "learning_rate": 8.917686798577968e-07, "loss": 1.9949, "num_input_tokens_seen": 165568296, "step": 162600 }, { "epoch": 3.203323423441161, "grad_norm": 1.964740514755249, "learning_rate": 8.91640537406962e-07, "loss": 1.9576, "num_input_tokens_seen": 165669936, "step": 162700 }, { "epoch": 3.2052922761906637, "grad_norm": 1.7610605955123901, "learning_rate": 8.915123283609572e-07, "loss": 2.018, "num_input_tokens_seen": 165771440, "step": 162800 }, { "epoch": 3.2072611289401665, "grad_norm": 2.0105230808258057, "learning_rate": 8.91384052741583e-07, "loss": 1.9705, "num_input_tokens_seen": 165873840, "step": 162900 }, { "epoch": 3.2092299816896697, "grad_norm": 1.9277650117874146, "learning_rate": 8.912557105706517e-07, "loss": 1.9828, "num_input_tokens_seen": 165975416, "step": 163000 }, { "epoch": 3.2111988344391724, "grad_norm": 1.7110425233840942, "learning_rate": 8.911273018699869e-07, "loss": 1.999, "num_input_tokens_seen": 166075720, "step": 163100 }, { "epoch": 3.213167687188675, "grad_norm": 1.8858184814453125, "learning_rate": 8.909988266614233e-07, "loss": 1.9918, "num_input_tokens_seen": 166176728, "step": 163200 }, { "epoch": 3.215136539938178, "grad_norm": 2.62264084815979, "learning_rate": 8.90870284966807e-07, "loss": 2.0197, "num_input_tokens_seen": 166279128, "step": 163300 }, { "epoch": 3.2171053926876807, "grad_norm": 1.7170372009277344, "learning_rate": 8.907416768079956e-07, "loss": 1.9746, "num_input_tokens_seen": 166379728, "step": 163400 }, { "epoch": 3.219074245437184, "grad_norm": 1.7519105672836304, "learning_rate": 8.906130022068577e-07, "loss": 2.0108, "num_input_tokens_seen": 166482128, "step": 163500 }, { "epoch": 3.2210430981866867, "grad_norm": 1.9250478744506836, "learning_rate": 8.904842611852734e-07, "loss": 2.0074, "num_input_tokens_seen": 166583688, "step": 163600 }, { "epoch": 3.2230119509361894, "grad_norm": 2.1288650035858154, "learning_rate": 8.903554537651339e-07, "loss": 1.9904, "num_input_tokens_seen": 166686088, "step": 163700 }, { "epoch": 3.224980803685692, "grad_norm": 2.430946111679077, "learning_rate": 8.90226579968342e-07, "loss": 1.951, "num_input_tokens_seen": 166788488, "step": 163800 }, { "epoch": 3.2269496564351954, "grad_norm": 2.035384178161621, "learning_rate": 8.900976398168113e-07, "loss": 1.9685, "num_input_tokens_seen": 166889480, "step": 163900 }, { "epoch": 3.228918509184698, "grad_norm": 1.9620369672775269, "learning_rate": 8.899686333324674e-07, "loss": 2.015, "num_input_tokens_seen": 166991232, "step": 164000 }, { "epoch": 3.230887361934201, "grad_norm": 1.7356258630752563, "learning_rate": 8.898395605372464e-07, "loss": 2.0182, "num_input_tokens_seen": 167092816, "step": 164100 }, { "epoch": 3.2328562146837037, "grad_norm": 1.9568620920181274, "learning_rate": 8.897104214530962e-07, "loss": 1.9717, "num_input_tokens_seen": 167193408, "step": 164200 }, { "epoch": 3.234825067433207, "grad_norm": 2.3432846069335938, "learning_rate": 8.895812161019759e-07, "loss": 2.0157, "num_input_tokens_seen": 167295312, "step": 164300 }, { "epoch": 3.2367939201827096, "grad_norm": 1.7052772045135498, "learning_rate": 8.894519445058556e-07, "loss": 1.922, "num_input_tokens_seen": 167397088, "step": 164400 }, { "epoch": 3.2387627729322124, "grad_norm": 1.9668554067611694, "learning_rate": 8.89322606686717e-07, "loss": 1.9658, "num_input_tokens_seen": 167498976, "step": 164500 }, { "epoch": 3.240731625681715, "grad_norm": 1.9469531774520874, "learning_rate": 8.891932026665528e-07, "loss": 1.9703, "num_input_tokens_seen": 167601376, "step": 164600 }, { "epoch": 3.2427004784312183, "grad_norm": 1.912940263748169, "learning_rate": 8.890637324673671e-07, "loss": 1.9732, "num_input_tokens_seen": 167703768, "step": 164700 }, { "epoch": 3.244669331180721, "grad_norm": 1.7971786260604858, "learning_rate": 8.889341961111751e-07, "loss": 2.0166, "num_input_tokens_seen": 167806168, "step": 164800 }, { "epoch": 3.246638183930224, "grad_norm": 1.7801483869552612, "learning_rate": 8.888045936200036e-07, "loss": 1.9954, "num_input_tokens_seen": 167908568, "step": 164900 }, { "epoch": 3.2486070366797266, "grad_norm": 2.1926114559173584, "learning_rate": 8.886749250158903e-07, "loss": 2.0289, "num_input_tokens_seen": 168010376, "step": 165000 }, { "epoch": 3.2505758894292294, "grad_norm": 1.8983944654464722, "learning_rate": 8.885451903208841e-07, "loss": 1.9615, "num_input_tokens_seen": 168111576, "step": 165100 }, { "epoch": 3.2525447421787326, "grad_norm": 2.0520403385162354, "learning_rate": 8.884153895570456e-07, "loss": 1.9826, "num_input_tokens_seen": 168213104, "step": 165200 }, { "epoch": 3.2545135949282353, "grad_norm": 1.9233990907669067, "learning_rate": 8.882855227464461e-07, "loss": 1.9622, "num_input_tokens_seen": 168314768, "step": 165300 }, { "epoch": 3.256482447677738, "grad_norm": 1.8786237239837646, "learning_rate": 8.881555899111683e-07, "loss": 1.9824, "num_input_tokens_seen": 168415584, "step": 165400 }, { "epoch": 3.2584513004272413, "grad_norm": 1.89914870262146, "learning_rate": 8.880255910733063e-07, "loss": 1.9423, "num_input_tokens_seen": 168517064, "step": 165500 }, { "epoch": 3.260420153176744, "grad_norm": 1.8395462036132812, "learning_rate": 8.878955262549655e-07, "loss": 1.9966, "num_input_tokens_seen": 168618720, "step": 165600 }, { "epoch": 3.262389005926247, "grad_norm": 2.196533679962158, "learning_rate": 8.877653954782619e-07, "loss": 1.9929, "num_input_tokens_seen": 168721120, "step": 165700 }, { "epoch": 3.2643578586757496, "grad_norm": 1.9165867567062378, "learning_rate": 8.876351987653235e-07, "loss": 1.9942, "num_input_tokens_seen": 168822248, "step": 165800 }, { "epoch": 3.2663267114252523, "grad_norm": 1.9275126457214355, "learning_rate": 8.875049361382889e-07, "loss": 1.9777, "num_input_tokens_seen": 168923696, "step": 165900 }, { "epoch": 3.2682955641747555, "grad_norm": 1.802634596824646, "learning_rate": 8.873746076193084e-07, "loss": 1.9761, "num_input_tokens_seen": 169024496, "step": 166000 }, { "epoch": 3.2702644169242583, "grad_norm": 2.0053305625915527, "learning_rate": 8.872442132305431e-07, "loss": 2.0066, "num_input_tokens_seen": 169126304, "step": 166100 }, { "epoch": 3.272233269673761, "grad_norm": 1.9875487089157104, "learning_rate": 8.871137529941655e-07, "loss": 1.967, "num_input_tokens_seen": 169226296, "step": 166200 }, { "epoch": 3.274202122423264, "grad_norm": 1.7244879007339478, "learning_rate": 8.869832269323592e-07, "loss": 1.9969, "num_input_tokens_seen": 169327776, "step": 166300 }, { "epoch": 3.276170975172767, "grad_norm": 1.9510544538497925, "learning_rate": 8.868526350673191e-07, "loss": 1.9694, "num_input_tokens_seen": 169430176, "step": 166400 }, { "epoch": 3.2781398279222698, "grad_norm": 1.7385942935943604, "learning_rate": 8.867219774212514e-07, "loss": 1.959, "num_input_tokens_seen": 169532488, "step": 166500 }, { "epoch": 3.2801086806717725, "grad_norm": 1.9729104042053223, "learning_rate": 8.865912540163731e-07, "loss": 2.009, "num_input_tokens_seen": 169634888, "step": 166600 }, { "epoch": 3.2820775334212753, "grad_norm": 2.0269644260406494, "learning_rate": 8.864604648749129e-07, "loss": 1.9963, "num_input_tokens_seen": 169737288, "step": 166700 }, { "epoch": 3.284046386170778, "grad_norm": 2.124392032623291, "learning_rate": 8.863296100191102e-07, "loss": 1.9577, "num_input_tokens_seen": 169839688, "step": 166800 }, { "epoch": 3.2860152389202812, "grad_norm": 2.3548800945281982, "learning_rate": 8.861986894712155e-07, "loss": 2.025, "num_input_tokens_seen": 169942088, "step": 166900 }, { "epoch": 3.287984091669784, "grad_norm": 1.7009906768798828, "learning_rate": 8.860677032534913e-07, "loss": 1.9597, "num_input_tokens_seen": 170044488, "step": 167000 }, { "epoch": 3.2899529444192868, "grad_norm": 2.1477997303009033, "learning_rate": 8.859366513882103e-07, "loss": 1.9842, "num_input_tokens_seen": 170146368, "step": 167100 }, { "epoch": 3.29192179716879, "grad_norm": 2.09075665473938, "learning_rate": 8.858055338976566e-07, "loss": 1.9543, "num_input_tokens_seen": 170247912, "step": 167200 }, { "epoch": 3.2938906499182927, "grad_norm": 1.9375501871109009, "learning_rate": 8.856743508041262e-07, "loss": 2.0287, "num_input_tokens_seen": 170349456, "step": 167300 }, { "epoch": 3.2958595026677955, "grad_norm": 2.09952974319458, "learning_rate": 8.855431021299253e-07, "loss": 1.9911, "num_input_tokens_seen": 170451856, "step": 167400 }, { "epoch": 3.2978283554172982, "grad_norm": 1.9498170614242554, "learning_rate": 8.854117878973718e-07, "loss": 2.0775, "num_input_tokens_seen": 170553328, "step": 167500 }, { "epoch": 3.299797208166801, "grad_norm": 2.5795865058898926, "learning_rate": 8.852804081287943e-07, "loss": 2.0236, "num_input_tokens_seen": 170655680, "step": 167600 }, { "epoch": 3.301766060916304, "grad_norm": 1.757726788520813, "learning_rate": 8.851489628465331e-07, "loss": 2.0025, "num_input_tokens_seen": 170757408, "step": 167700 }, { "epoch": 3.303734913665807, "grad_norm": 1.8712294101715088, "learning_rate": 8.850174520729395e-07, "loss": 2.0344, "num_input_tokens_seen": 170858144, "step": 167800 }, { "epoch": 3.3057037664153097, "grad_norm": 2.155019521713257, "learning_rate": 8.848858758303752e-07, "loss": 1.9832, "num_input_tokens_seen": 170960232, "step": 167900 }, { "epoch": 3.3076726191648125, "grad_norm": 1.9364187717437744, "learning_rate": 8.84754234141214e-07, "loss": 1.9534, "num_input_tokens_seen": 171062632, "step": 168000 }, { "epoch": 3.3096414719143157, "grad_norm": 1.8489073514938354, "learning_rate": 8.846225270278406e-07, "loss": 1.9614, "num_input_tokens_seen": 171165032, "step": 168100 }, { "epoch": 3.3116103246638184, "grad_norm": 1.943538784980774, "learning_rate": 8.844907545126506e-07, "loss": 1.9851, "num_input_tokens_seen": 171267144, "step": 168200 }, { "epoch": 3.313579177413321, "grad_norm": 1.8676447868347168, "learning_rate": 8.843589166180506e-07, "loss": 1.9712, "num_input_tokens_seen": 171369544, "step": 168300 }, { "epoch": 3.315548030162824, "grad_norm": 1.9416911602020264, "learning_rate": 8.842270133664586e-07, "loss": 1.9966, "num_input_tokens_seen": 171470576, "step": 168400 }, { "epoch": 3.317516882912327, "grad_norm": 1.9778763055801392, "learning_rate": 8.84095044780304e-07, "loss": 1.9937, "num_input_tokens_seen": 171572976, "step": 168500 }, { "epoch": 3.31948573566183, "grad_norm": 2.098393678665161, "learning_rate": 8.839630108820264e-07, "loss": 2.0208, "num_input_tokens_seen": 171673944, "step": 168600 }, { "epoch": 3.3214545884113327, "grad_norm": 2.0689074993133545, "learning_rate": 8.838309116940773e-07, "loss": 1.9673, "num_input_tokens_seen": 171776344, "step": 168700 }, { "epoch": 3.3234234411608354, "grad_norm": 2.3261332511901855, "learning_rate": 8.83698747238919e-07, "loss": 1.9714, "num_input_tokens_seen": 171878744, "step": 168800 }, { "epoch": 3.3253922939103386, "grad_norm": 1.6794986724853516, "learning_rate": 8.835665175390252e-07, "loss": 2.0109, "num_input_tokens_seen": 171980688, "step": 168900 }, { "epoch": 3.3273611466598414, "grad_norm": 1.9332787990570068, "learning_rate": 8.8343422261688e-07, "loss": 1.9729, "num_input_tokens_seen": 172083088, "step": 169000 }, { "epoch": 3.329329999409344, "grad_norm": 1.6862245798110962, "learning_rate": 8.833018624949795e-07, "loss": 2.0067, "num_input_tokens_seen": 172185488, "step": 169100 }, { "epoch": 3.331298852158847, "grad_norm": 1.9138447046279907, "learning_rate": 8.831694371958302e-07, "loss": 2.0186, "num_input_tokens_seen": 172285832, "step": 169200 }, { "epoch": 3.3332677049083497, "grad_norm": 1.9838453531265259, "learning_rate": 8.830369467419497e-07, "loss": 1.9824, "num_input_tokens_seen": 172388232, "step": 169300 }, { "epoch": 3.335236557657853, "grad_norm": 2.972688674926758, "learning_rate": 8.829043911558671e-07, "loss": 2.0135, "num_input_tokens_seen": 172489536, "step": 169400 }, { "epoch": 3.3372054104073556, "grad_norm": 1.8738408088684082, "learning_rate": 8.827717704601225e-07, "loss": 1.9755, "num_input_tokens_seen": 172590632, "step": 169500 }, { "epoch": 3.3391742631568584, "grad_norm": 1.9847636222839355, "learning_rate": 8.826390846772668e-07, "loss": 1.938, "num_input_tokens_seen": 172691736, "step": 169600 }, { "epoch": 3.3411431159063616, "grad_norm": 1.9578179121017456, "learning_rate": 8.825063338298619e-07, "loss": 1.9784, "num_input_tokens_seen": 172793368, "step": 169700 }, { "epoch": 3.3431119686558644, "grad_norm": 1.7819938659667969, "learning_rate": 8.823735179404812e-07, "loss": 1.9772, "num_input_tokens_seen": 172894400, "step": 169800 }, { "epoch": 3.345080821405367, "grad_norm": 2.017322301864624, "learning_rate": 8.82240637031709e-07, "loss": 1.9745, "num_input_tokens_seen": 172996800, "step": 169900 }, { "epoch": 3.34704967415487, "grad_norm": 2.1448822021484375, "learning_rate": 8.821076911261403e-07, "loss": 1.9564, "num_input_tokens_seen": 173099200, "step": 170000 }, { "epoch": 3.3490185269043726, "grad_norm": 1.874789834022522, "learning_rate": 8.819746802463817e-07, "loss": 1.9886, "num_input_tokens_seen": 173200976, "step": 170100 }, { "epoch": 3.350987379653876, "grad_norm": 2.202587604522705, "learning_rate": 8.818416044150503e-07, "loss": 1.9684, "num_input_tokens_seen": 173303376, "step": 170200 }, { "epoch": 3.3529562324033786, "grad_norm": 2.053558826446533, "learning_rate": 8.817084636547747e-07, "loss": 1.9877, "num_input_tokens_seen": 173404800, "step": 170300 }, { "epoch": 3.3549250851528813, "grad_norm": 1.969000220298767, "learning_rate": 8.815752579881944e-07, "loss": 1.9827, "num_input_tokens_seen": 173507200, "step": 170400 }, { "epoch": 3.356893937902384, "grad_norm": 1.8853901624679565, "learning_rate": 8.814419874379598e-07, "loss": 1.9758, "num_input_tokens_seen": 173608488, "step": 170500 }, { "epoch": 3.3588627906518873, "grad_norm": 5.5333380699157715, "learning_rate": 8.813086520267324e-07, "loss": 1.9703, "num_input_tokens_seen": 173708024, "step": 170600 }, { "epoch": 3.36083164340139, "grad_norm": 2.043829917907715, "learning_rate": 8.811752517771848e-07, "loss": 1.9842, "num_input_tokens_seen": 173810064, "step": 170700 }, { "epoch": 3.362800496150893, "grad_norm": 1.8781522512435913, "learning_rate": 8.810417867120008e-07, "loss": 1.9513, "num_input_tokens_seen": 173912464, "step": 170800 }, { "epoch": 3.3647693489003956, "grad_norm": 3.270911693572998, "learning_rate": 8.809082568538748e-07, "loss": 2.0031, "num_input_tokens_seen": 174014216, "step": 170900 }, { "epoch": 3.366738201649899, "grad_norm": 2.1133625507354736, "learning_rate": 8.807746622255123e-07, "loss": 1.9944, "num_input_tokens_seen": 174116616, "step": 171000 }, { "epoch": 3.3687070543994015, "grad_norm": 1.9044172763824463, "learning_rate": 8.806410028496302e-07, "loss": 1.9605, "num_input_tokens_seen": 174219008, "step": 171100 }, { "epoch": 3.3706759071489043, "grad_norm": 1.9099133014678955, "learning_rate": 8.805072787489561e-07, "loss": 2.0292, "num_input_tokens_seen": 174320560, "step": 171200 }, { "epoch": 3.372644759898407, "grad_norm": 2.0432288646698, "learning_rate": 8.803734899462285e-07, "loss": 2.0272, "num_input_tokens_seen": 174421456, "step": 171300 }, { "epoch": 3.3746136126479103, "grad_norm": 1.81580650806427, "learning_rate": 8.802396364641971e-07, "loss": 2.0202, "num_input_tokens_seen": 174523856, "step": 171400 }, { "epoch": 3.376582465397413, "grad_norm": 2.0548923015594482, "learning_rate": 8.801057183256226e-07, "loss": 1.9989, "num_input_tokens_seen": 174626256, "step": 171500 }, { "epoch": 3.378551318146916, "grad_norm": 1.7348370552062988, "learning_rate": 8.799717355532767e-07, "loss": 1.9893, "num_input_tokens_seen": 174726680, "step": 171600 }, { "epoch": 3.3805201708964185, "grad_norm": 1.9614585638046265, "learning_rate": 8.798376881699421e-07, "loss": 1.9935, "num_input_tokens_seen": 174827712, "step": 171700 }, { "epoch": 3.3824890236459213, "grad_norm": 2.0643503665924072, "learning_rate": 8.797035761984123e-07, "loss": 1.9866, "num_input_tokens_seen": 174930112, "step": 171800 }, { "epoch": 3.3844578763954245, "grad_norm": 1.7682868242263794, "learning_rate": 8.795693996614919e-07, "loss": 1.9844, "num_input_tokens_seen": 175031400, "step": 171900 }, { "epoch": 3.3864267291449273, "grad_norm": 2.0654399394989014, "learning_rate": 8.794351585819965e-07, "loss": 2.0195, "num_input_tokens_seen": 175133800, "step": 172000 }, { "epoch": 3.38839558189443, "grad_norm": 1.854973316192627, "learning_rate": 8.793008529827527e-07, "loss": 1.9906, "num_input_tokens_seen": 175234848, "step": 172100 }, { "epoch": 3.3903644346439332, "grad_norm": 1.7924410104751587, "learning_rate": 8.79166482886598e-07, "loss": 1.9701, "num_input_tokens_seen": 175337248, "step": 172200 }, { "epoch": 3.392333287393436, "grad_norm": 1.903087854385376, "learning_rate": 8.790320483163809e-07, "loss": 1.987, "num_input_tokens_seen": 175438224, "step": 172300 }, { "epoch": 3.3943021401429387, "grad_norm": 1.9773424863815308, "learning_rate": 8.78897549294961e-07, "loss": 2.0261, "num_input_tokens_seen": 175539688, "step": 172400 }, { "epoch": 3.3962709928924415, "grad_norm": 2.204087972640991, "learning_rate": 8.787629858452085e-07, "loss": 2.0154, "num_input_tokens_seen": 175640208, "step": 172500 }, { "epoch": 3.3982398456419443, "grad_norm": 1.6858693361282349, "learning_rate": 8.786283579900049e-07, "loss": 1.982, "num_input_tokens_seen": 175742608, "step": 172600 }, { "epoch": 3.4002086983914475, "grad_norm": 2.1794304847717285, "learning_rate": 8.784936657522426e-07, "loss": 1.9456, "num_input_tokens_seen": 175844480, "step": 172700 }, { "epoch": 3.40217755114095, "grad_norm": 1.7944830656051636, "learning_rate": 8.783589091548247e-07, "loss": 1.9911, "num_input_tokens_seen": 175946008, "step": 172800 }, { "epoch": 3.404146403890453, "grad_norm": 1.7953822612762451, "learning_rate": 8.782240882206655e-07, "loss": 1.9776, "num_input_tokens_seen": 176048408, "step": 172900 }, { "epoch": 3.4061152566399557, "grad_norm": 2.0612521171569824, "learning_rate": 8.780892029726904e-07, "loss": 2.0142, "num_input_tokens_seen": 176149992, "step": 173000 }, { "epoch": 3.408084109389459, "grad_norm": 1.9655766487121582, "learning_rate": 8.779542534338351e-07, "loss": 1.959, "num_input_tokens_seen": 176251648, "step": 173100 }, { "epoch": 3.4100529621389617, "grad_norm": 1.9699556827545166, "learning_rate": 8.77819239627047e-07, "loss": 2.0098, "num_input_tokens_seen": 176351816, "step": 173200 }, { "epoch": 3.4120218148884645, "grad_norm": 1.851715087890625, "learning_rate": 8.776841615752837e-07, "loss": 1.9588, "num_input_tokens_seen": 176453520, "step": 173300 }, { "epoch": 3.413990667637967, "grad_norm": 2.1681926250457764, "learning_rate": 8.775490193015143e-07, "loss": 2.0074, "num_input_tokens_seen": 176555920, "step": 173400 }, { "epoch": 3.41595952038747, "grad_norm": 1.6440446376800537, "learning_rate": 8.774138128287185e-07, "loss": 1.9695, "num_input_tokens_seen": 176658320, "step": 173500 }, { "epoch": 3.417928373136973, "grad_norm": 2.0168633460998535, "learning_rate": 8.772785421798872e-07, "loss": 1.926, "num_input_tokens_seen": 176760720, "step": 173600 }, { "epoch": 3.419897225886476, "grad_norm": 1.994805932044983, "learning_rate": 8.771432073780219e-07, "loss": 1.9823, "num_input_tokens_seen": 176862416, "step": 173700 }, { "epoch": 3.4218660786359787, "grad_norm": 1.8778208494186401, "learning_rate": 8.770078084461351e-07, "loss": 1.9637, "num_input_tokens_seen": 176964816, "step": 173800 }, { "epoch": 3.423834931385482, "grad_norm": 2.268155574798584, "learning_rate": 8.768723454072502e-07, "loss": 1.9809, "num_input_tokens_seen": 177066648, "step": 173900 }, { "epoch": 3.4258037841349847, "grad_norm": 1.6476994752883911, "learning_rate": 8.767368182844018e-07, "loss": 2.001, "num_input_tokens_seen": 177169048, "step": 174000 }, { "epoch": 3.4277726368844874, "grad_norm": 1.9171645641326904, "learning_rate": 8.766012271006351e-07, "loss": 1.9709, "num_input_tokens_seen": 177270760, "step": 174100 }, { "epoch": 3.42974148963399, "grad_norm": 1.7686083316802979, "learning_rate": 8.764655718790059e-07, "loss": 2.039, "num_input_tokens_seen": 177373160, "step": 174200 }, { "epoch": 3.431710342383493, "grad_norm": 2.0800998210906982, "learning_rate": 8.763298526425816e-07, "loss": 2.0096, "num_input_tokens_seen": 177475128, "step": 174300 }, { "epoch": 3.433679195132996, "grad_norm": 1.8019814491271973, "learning_rate": 8.761940694144399e-07, "loss": 1.9807, "num_input_tokens_seen": 177577528, "step": 174400 }, { "epoch": 3.435648047882499, "grad_norm": 2.1396100521087646, "learning_rate": 8.760582222176698e-07, "loss": 1.9268, "num_input_tokens_seen": 177679928, "step": 174500 }, { "epoch": 3.4376169006320016, "grad_norm": 2.138073205947876, "learning_rate": 8.759223110753708e-07, "loss": 1.9931, "num_input_tokens_seen": 177782328, "step": 174600 }, { "epoch": 3.4395857533815044, "grad_norm": 2.0238165855407715, "learning_rate": 8.757863360106534e-07, "loss": 2.0324, "num_input_tokens_seen": 177884728, "step": 174700 }, { "epoch": 3.4415546061310076, "grad_norm": 1.9930150508880615, "learning_rate": 8.756502970466391e-07, "loss": 2.0009, "num_input_tokens_seen": 177987128, "step": 174800 }, { "epoch": 3.4435234588805104, "grad_norm": 1.9081519842147827, "learning_rate": 8.755141942064601e-07, "loss": 1.937, "num_input_tokens_seen": 178088752, "step": 174900 }, { "epoch": 3.445492311630013, "grad_norm": 1.9462133646011353, "learning_rate": 8.753780275132598e-07, "loss": 1.9945, "num_input_tokens_seen": 178191152, "step": 175000 }, { "epoch": 3.447461164379516, "grad_norm": 1.8115335702896118, "learning_rate": 8.752417969901918e-07, "loss": 2.0213, "num_input_tokens_seen": 178292944, "step": 175100 }, { "epoch": 3.449430017129019, "grad_norm": 1.675045132637024, "learning_rate": 8.751055026604213e-07, "loss": 2.0624, "num_input_tokens_seen": 178394768, "step": 175200 }, { "epoch": 3.451398869878522, "grad_norm": 2.788806915283203, "learning_rate": 8.749691445471236e-07, "loss": 2.0736, "num_input_tokens_seen": 178496768, "step": 175300 }, { "epoch": 3.4533677226280246, "grad_norm": 1.9393644332885742, "learning_rate": 8.748327226734858e-07, "loss": 1.9192, "num_input_tokens_seen": 178598128, "step": 175400 }, { "epoch": 3.4553365753775274, "grad_norm": 2.0350983142852783, "learning_rate": 8.746962370627049e-07, "loss": 1.9605, "num_input_tokens_seen": 178699840, "step": 175500 }, { "epoch": 3.4573054281270306, "grad_norm": 1.8694217205047607, "learning_rate": 8.745596877379893e-07, "loss": 2.0159, "num_input_tokens_seen": 178800144, "step": 175600 }, { "epoch": 3.4592742808765333, "grad_norm": 1.9558669328689575, "learning_rate": 8.744230747225579e-07, "loss": 1.9723, "num_input_tokens_seen": 178902544, "step": 175700 }, { "epoch": 3.461243133626036, "grad_norm": 1.9053242206573486, "learning_rate": 8.742863980396407e-07, "loss": 1.9882, "num_input_tokens_seen": 179004304, "step": 175800 }, { "epoch": 3.463211986375539, "grad_norm": 1.9949630498886108, "learning_rate": 8.741496577124783e-07, "loss": 2.0504, "num_input_tokens_seen": 179106704, "step": 175900 }, { "epoch": 3.4651808391250416, "grad_norm": 1.9565402269363403, "learning_rate": 8.740128537643225e-07, "loss": 2.0043, "num_input_tokens_seen": 179209104, "step": 176000 }, { "epoch": 3.467149691874545, "grad_norm": 1.7130528688430786, "learning_rate": 8.738759862184353e-07, "loss": 1.9861, "num_input_tokens_seen": 179310104, "step": 176100 }, { "epoch": 3.4691185446240476, "grad_norm": 1.7138453722000122, "learning_rate": 8.737390550980901e-07, "loss": 1.9759, "num_input_tokens_seen": 179411736, "step": 176200 }, { "epoch": 3.4710873973735503, "grad_norm": 1.835710883140564, "learning_rate": 8.73602060426571e-07, "loss": 1.9581, "num_input_tokens_seen": 179514136, "step": 176300 }, { "epoch": 3.4730562501230535, "grad_norm": 5.932350158691406, "learning_rate": 8.734650022271725e-07, "loss": 1.945, "num_input_tokens_seen": 179616536, "step": 176400 }, { "epoch": 3.4750251028725563, "grad_norm": 1.84043550491333, "learning_rate": 8.733278805232003e-07, "loss": 1.9158, "num_input_tokens_seen": 179718936, "step": 176500 }, { "epoch": 3.476993955622059, "grad_norm": 2.0021297931671143, "learning_rate": 8.731906953379709e-07, "loss": 1.9646, "num_input_tokens_seen": 179821336, "step": 176600 }, { "epoch": 3.478962808371562, "grad_norm": 2.124436378479004, "learning_rate": 8.730534466948113e-07, "loss": 1.9504, "num_input_tokens_seen": 179922088, "step": 176700 }, { "epoch": 3.4809316611210646, "grad_norm": 2.1138103008270264, "learning_rate": 8.729161346170596e-07, "loss": 1.9762, "num_input_tokens_seen": 180024488, "step": 176800 }, { "epoch": 3.4829005138705678, "grad_norm": 2.0620594024658203, "learning_rate": 8.727787591280646e-07, "loss": 2.0408, "num_input_tokens_seen": 180126888, "step": 176900 }, { "epoch": 3.4848693666200705, "grad_norm": 1.9161667823791504, "learning_rate": 8.726413202511856e-07, "loss": 1.9324, "num_input_tokens_seen": 180228744, "step": 177000 }, { "epoch": 3.4868382193695733, "grad_norm": 1.696247935295105, "learning_rate": 8.725038180097932e-07, "loss": 1.9831, "num_input_tokens_seen": 180330384, "step": 177100 }, { "epoch": 3.488807072119076, "grad_norm": 1.9214447736740112, "learning_rate": 8.723662524272684e-07, "loss": 1.9566, "num_input_tokens_seen": 180432784, "step": 177200 }, { "epoch": 3.4907759248685792, "grad_norm": 2.0514333248138428, "learning_rate": 8.722286235270031e-07, "loss": 2.0537, "num_input_tokens_seen": 180534240, "step": 177300 }, { "epoch": 3.492744777618082, "grad_norm": 2.177111864089966, "learning_rate": 8.720909313323999e-07, "loss": 2.0004, "num_input_tokens_seen": 180636640, "step": 177400 }, { "epoch": 3.4947136303675848, "grad_norm": 1.830773949623108, "learning_rate": 8.71953175866872e-07, "loss": 1.9714, "num_input_tokens_seen": 180737616, "step": 177500 }, { "epoch": 3.4966824831170875, "grad_norm": 1.8422319889068604, "learning_rate": 8.718153571538439e-07, "loss": 2.0145, "num_input_tokens_seen": 180838904, "step": 177600 }, { "epoch": 3.4986513358665907, "grad_norm": 1.8522621393203735, "learning_rate": 8.716774752167504e-07, "loss": 1.9813, "num_input_tokens_seen": 180941304, "step": 177700 }, { "epoch": 3.5006201886160935, "grad_norm": 1.8867874145507812, "learning_rate": 8.715395300790369e-07, "loss": 2.0262, "num_input_tokens_seen": 181042648, "step": 177800 }, { "epoch": 3.5025890413655962, "grad_norm": 2.1557350158691406, "learning_rate": 8.714015217641601e-07, "loss": 1.9373, "num_input_tokens_seen": 181145048, "step": 177900 }, { "epoch": 3.504557894115099, "grad_norm": 2.2549006938934326, "learning_rate": 8.712634502955872e-07, "loss": 2.0458, "num_input_tokens_seen": 181247448, "step": 178000 }, { "epoch": 3.506526746864602, "grad_norm": 19.7275333404541, "learning_rate": 8.711253156967959e-07, "loss": 2.0102, "num_input_tokens_seen": 181349704, "step": 178100 }, { "epoch": 3.508495599614105, "grad_norm": 1.8544641733169556, "learning_rate": 8.709871179912749e-07, "loss": 2.0012, "num_input_tokens_seen": 181450736, "step": 178200 }, { "epoch": 3.5104644523636077, "grad_norm": 2.025705575942993, "learning_rate": 8.708488572025234e-07, "loss": 1.9897, "num_input_tokens_seen": 181550728, "step": 178300 }, { "epoch": 3.5124333051131105, "grad_norm": 1.967421531677246, "learning_rate": 8.707105333540518e-07, "loss": 2.0217, "num_input_tokens_seen": 181653128, "step": 178400 }, { "epoch": 3.5144021578626132, "grad_norm": 1.8565772771835327, "learning_rate": 8.705721464693805e-07, "loss": 2.0151, "num_input_tokens_seen": 181755528, "step": 178500 }, { "epoch": 3.5163710106121164, "grad_norm": 1.5747811794281006, "learning_rate": 8.704336965720414e-07, "loss": 1.9748, "num_input_tokens_seen": 181857176, "step": 178600 }, { "epoch": 3.518339863361619, "grad_norm": 2.0144262313842773, "learning_rate": 8.702951836855766e-07, "loss": 1.9825, "num_input_tokens_seen": 181959576, "step": 178700 }, { "epoch": 3.520308716111122, "grad_norm": 2.0384912490844727, "learning_rate": 8.70156607833539e-07, "loss": 1.959, "num_input_tokens_seen": 182061536, "step": 178800 }, { "epoch": 3.522277568860625, "grad_norm": 1.6842730045318604, "learning_rate": 8.700179690394922e-07, "loss": 1.9947, "num_input_tokens_seen": 182163464, "step": 178900 }, { "epoch": 3.524246421610128, "grad_norm": 1.6675080060958862, "learning_rate": 8.698792673270108e-07, "loss": 1.9709, "num_input_tokens_seen": 182265160, "step": 179000 }, { "epoch": 3.5262152743596307, "grad_norm": 0.0, "learning_rate": 8.697405027196796e-07, "loss": 2.0081, "num_input_tokens_seen": 182366032, "step": 179100 }, { "epoch": 3.5281841271091334, "grad_norm": 2.0393857955932617, "learning_rate": 8.696016752410944e-07, "loss": 1.9512, "num_input_tokens_seen": 182468432, "step": 179200 }, { "epoch": 3.530152979858636, "grad_norm": 1.8243776559829712, "learning_rate": 8.694627849148616e-07, "loss": 2.0498, "num_input_tokens_seen": 182570832, "step": 179300 }, { "epoch": 3.5321218326081394, "grad_norm": 2.2270710468292236, "learning_rate": 8.693238317645988e-07, "loss": 1.9975, "num_input_tokens_seen": 182673232, "step": 179400 }, { "epoch": 3.534090685357642, "grad_norm": 1.792210340499878, "learning_rate": 8.691848158139331e-07, "loss": 1.9612, "num_input_tokens_seen": 182775632, "step": 179500 }, { "epoch": 3.536059538107145, "grad_norm": 2.04830002784729, "learning_rate": 8.690457370865034e-07, "loss": 1.9844, "num_input_tokens_seen": 182878032, "step": 179600 }, { "epoch": 3.538028390856648, "grad_norm": 2.1170568466186523, "learning_rate": 8.689065956059588e-07, "loss": 1.9488, "num_input_tokens_seen": 182979704, "step": 179700 }, { "epoch": 3.539997243606151, "grad_norm": 2.1218490600585938, "learning_rate": 8.687673913959591e-07, "loss": 1.9948, "num_input_tokens_seen": 183081480, "step": 179800 }, { "epoch": 3.5419660963556536, "grad_norm": 1.859352946281433, "learning_rate": 8.686281244801748e-07, "loss": 1.9588, "num_input_tokens_seen": 183183880, "step": 179900 }, { "epoch": 3.5439349491051564, "grad_norm": 1.9357470273971558, "learning_rate": 8.684887948822872e-07, "loss": 1.9905, "num_input_tokens_seen": 183286280, "step": 180000 }, { "epoch": 3.545903801854659, "grad_norm": 2.0041255950927734, "learning_rate": 8.683494026259879e-07, "loss": 2.01, "num_input_tokens_seen": 183388680, "step": 180100 }, { "epoch": 3.547872654604162, "grad_norm": 1.6589006185531616, "learning_rate": 8.682099477349795e-07, "loss": 1.9872, "num_input_tokens_seen": 183489744, "step": 180200 }, { "epoch": 3.549841507353665, "grad_norm": 2.102747917175293, "learning_rate": 8.680704302329751e-07, "loss": 2.0279, "num_input_tokens_seen": 183591624, "step": 180300 }, { "epoch": 3.551810360103168, "grad_norm": 1.8964042663574219, "learning_rate": 8.679308501436985e-07, "loss": 2.0346, "num_input_tokens_seen": 183694024, "step": 180400 }, { "epoch": 3.5537792128526706, "grad_norm": 2.020772933959961, "learning_rate": 8.677912074908842e-07, "loss": 2.0251, "num_input_tokens_seen": 183795032, "step": 180500 }, { "epoch": 3.555748065602174, "grad_norm": 5.046718597412109, "learning_rate": 8.67651502298277e-07, "loss": 1.9951, "num_input_tokens_seen": 183895856, "step": 180600 }, { "epoch": 3.5577169183516766, "grad_norm": 2.6011369228363037, "learning_rate": 8.675117345896328e-07, "loss": 1.9797, "num_input_tokens_seen": 183998160, "step": 180700 }, { "epoch": 3.5596857711011793, "grad_norm": 1.867495059967041, "learning_rate": 8.67371904388718e-07, "loss": 2.0258, "num_input_tokens_seen": 184100080, "step": 180800 }, { "epoch": 3.561654623850682, "grad_norm": 2.237550735473633, "learning_rate": 8.672320117193093e-07, "loss": 1.9952, "num_input_tokens_seen": 184200392, "step": 180900 }, { "epoch": 3.563623476600185, "grad_norm": 1.8391515016555786, "learning_rate": 8.670920566051943e-07, "loss": 1.9979, "num_input_tokens_seen": 184302000, "step": 181000 }, { "epoch": 3.565592329349688, "grad_norm": 1.8157563209533691, "learning_rate": 8.669520390701715e-07, "loss": 1.9619, "num_input_tokens_seen": 184404400, "step": 181100 }, { "epoch": 3.567561182099191, "grad_norm": 2.065551996231079, "learning_rate": 8.668119591380492e-07, "loss": 1.9436, "num_input_tokens_seen": 184506800, "step": 181200 }, { "epoch": 3.5695300348486936, "grad_norm": 1.8277415037155151, "learning_rate": 8.666718168326472e-07, "loss": 1.991, "num_input_tokens_seen": 184608424, "step": 181300 }, { "epoch": 3.571498887598197, "grad_norm": 1.8323853015899658, "learning_rate": 8.665316121777955e-07, "loss": 1.9495, "num_input_tokens_seen": 184710528, "step": 181400 }, { "epoch": 3.5734677403476995, "grad_norm": 1.8363794088363647, "learning_rate": 8.663913451973346e-07, "loss": 2.0341, "num_input_tokens_seen": 184810776, "step": 181500 }, { "epoch": 3.5754365930972023, "grad_norm": 1.8086886405944824, "learning_rate": 8.662510159151156e-07, "loss": 1.9911, "num_input_tokens_seen": 184913176, "step": 181600 }, { "epoch": 3.577405445846705, "grad_norm": 2.2289958000183105, "learning_rate": 8.661106243550005e-07, "loss": 2.0103, "num_input_tokens_seen": 185013848, "step": 181700 }, { "epoch": 3.579374298596208, "grad_norm": 1.8465604782104492, "learning_rate": 8.659701705408617e-07, "loss": 2.0237, "num_input_tokens_seen": 185115936, "step": 181800 }, { "epoch": 3.581343151345711, "grad_norm": 1.8184815645217896, "learning_rate": 8.658296544965822e-07, "loss": 1.962, "num_input_tokens_seen": 185217720, "step": 181900 }, { "epoch": 3.5833120040952138, "grad_norm": 2.013380527496338, "learning_rate": 8.656890762460553e-07, "loss": 1.9993, "num_input_tokens_seen": 185318928, "step": 182000 }, { "epoch": 3.5852808568447165, "grad_norm": 2.047400712966919, "learning_rate": 8.655484358131855e-07, "loss": 1.9998, "num_input_tokens_seen": 185419008, "step": 182100 }, { "epoch": 3.5872497095942193, "grad_norm": 1.7890983819961548, "learning_rate": 8.654077332218873e-07, "loss": 1.9903, "num_input_tokens_seen": 185520616, "step": 182200 }, { "epoch": 3.5892185623437225, "grad_norm": 2.072608709335327, "learning_rate": 8.65266968496086e-07, "loss": 1.9607, "num_input_tokens_seen": 185621336, "step": 182300 }, { "epoch": 3.5911874150932253, "grad_norm": 1.9646893739700317, "learning_rate": 8.651261416597178e-07, "loss": 2.0364, "num_input_tokens_seen": 185722904, "step": 182400 }, { "epoch": 3.593156267842728, "grad_norm": 1.9087058305740356, "learning_rate": 8.649852527367286e-07, "loss": 1.9962, "num_input_tokens_seen": 185825304, "step": 182500 }, { "epoch": 3.5951251205922308, "grad_norm": 1.9879136085510254, "learning_rate": 8.648443017510756e-07, "loss": 1.9647, "num_input_tokens_seen": 185926208, "step": 182600 }, { "epoch": 3.5970939733417335, "grad_norm": 1.9980089664459229, "learning_rate": 8.647032887267262e-07, "loss": 1.9971, "num_input_tokens_seen": 186027944, "step": 182700 }, { "epoch": 3.5990628260912367, "grad_norm": 1.8720948696136475, "learning_rate": 8.645622136876589e-07, "loss": 1.9784, "num_input_tokens_seen": 186129224, "step": 182800 }, { "epoch": 3.6010316788407395, "grad_norm": 2.2965409755706787, "learning_rate": 8.64421076657862e-07, "loss": 1.9903, "num_input_tokens_seen": 186230952, "step": 182900 }, { "epoch": 3.6030005315902423, "grad_norm": 2.0069820880889893, "learning_rate": 8.642798776613344e-07, "loss": 1.9691, "num_input_tokens_seen": 186333352, "step": 183000 }, { "epoch": 3.6049693843397455, "grad_norm": 1.9990017414093018, "learning_rate": 8.641386167220863e-07, "loss": 2.0193, "num_input_tokens_seen": 186435752, "step": 183100 }, { "epoch": 3.606938237089248, "grad_norm": 1.9313108921051025, "learning_rate": 8.639972938641375e-07, "loss": 1.96, "num_input_tokens_seen": 186536600, "step": 183200 }, { "epoch": 3.608907089838751, "grad_norm": 1.6989598274230957, "learning_rate": 8.638559091115191e-07, "loss": 1.9586, "num_input_tokens_seen": 186639000, "step": 183300 }, { "epoch": 3.6108759425882537, "grad_norm": 2.0604283809661865, "learning_rate": 8.637144624882724e-07, "loss": 2.0078, "num_input_tokens_seen": 186741400, "step": 183400 }, { "epoch": 3.6128447953377565, "grad_norm": 1.9702231884002686, "learning_rate": 8.63572954018449e-07, "loss": 1.9799, "num_input_tokens_seen": 186843048, "step": 183500 }, { "epoch": 3.6148136480872597, "grad_norm": 2.0850439071655273, "learning_rate": 8.634313837261111e-07, "loss": 1.9358, "num_input_tokens_seen": 186945448, "step": 183600 }, { "epoch": 3.6167825008367624, "grad_norm": 1.6503615379333496, "learning_rate": 8.632897516353319e-07, "loss": 1.9519, "num_input_tokens_seen": 187047848, "step": 183700 }, { "epoch": 3.618751353586265, "grad_norm": 1.742605209350586, "learning_rate": 8.631480577701947e-07, "loss": 1.9903, "num_input_tokens_seen": 187148944, "step": 183800 }, { "epoch": 3.6207202063357684, "grad_norm": 1.9477397203445435, "learning_rate": 8.630063021547929e-07, "loss": 1.9741, "num_input_tokens_seen": 187250504, "step": 183900 }, { "epoch": 3.622689059085271, "grad_norm": 1.8840001821517944, "learning_rate": 8.628644848132314e-07, "loss": 2.0098, "num_input_tokens_seen": 187352904, "step": 184000 }, { "epoch": 3.624657911834774, "grad_norm": 2.2380168437957764, "learning_rate": 8.627226057696245e-07, "loss": 2.0108, "num_input_tokens_seen": 187454432, "step": 184100 }, { "epoch": 3.6266267645842767, "grad_norm": 1.914587140083313, "learning_rate": 8.62580665048098e-07, "loss": 1.9512, "num_input_tokens_seen": 187556832, "step": 184200 }, { "epoch": 3.6285956173337794, "grad_norm": 1.8810619115829468, "learning_rate": 8.624386626727875e-07, "loss": 1.9921, "num_input_tokens_seen": 187658920, "step": 184300 }, { "epoch": 3.630564470083282, "grad_norm": 2.317134380340576, "learning_rate": 8.622965986678396e-07, "loss": 1.9784, "num_input_tokens_seen": 187760560, "step": 184400 }, { "epoch": 3.6325333228327854, "grad_norm": 1.864627718925476, "learning_rate": 8.621544730574106e-07, "loss": 1.9742, "num_input_tokens_seen": 187862960, "step": 184500 }, { "epoch": 3.634502175582288, "grad_norm": 1.7810827493667603, "learning_rate": 8.62012285865668e-07, "loss": 1.9929, "num_input_tokens_seen": 187963976, "step": 184600 }, { "epoch": 3.636471028331791, "grad_norm": 3.099287748336792, "learning_rate": 8.618700371167898e-07, "loss": 2.0315, "num_input_tokens_seen": 188065560, "step": 184700 }, { "epoch": 3.638439881081294, "grad_norm": 1.766345500946045, "learning_rate": 8.617277268349636e-07, "loss": 1.9878, "num_input_tokens_seen": 188167736, "step": 184800 }, { "epoch": 3.640408733830797, "grad_norm": 1.857459306716919, "learning_rate": 8.615853550443887e-07, "loss": 1.9637, "num_input_tokens_seen": 188269344, "step": 184900 }, { "epoch": 3.6423775865802996, "grad_norm": 1.7559006214141846, "learning_rate": 8.614429217692738e-07, "loss": 2.0003, "num_input_tokens_seen": 188371744, "step": 185000 }, { "epoch": 3.6443464393298024, "grad_norm": 1.821175217628479, "learning_rate": 8.613004270338384e-07, "loss": 2.0161, "num_input_tokens_seen": 188472352, "step": 185100 }, { "epoch": 3.646315292079305, "grad_norm": 1.9865384101867676, "learning_rate": 8.611578708623131e-07, "loss": 1.9891, "num_input_tokens_seen": 188574752, "step": 185200 }, { "epoch": 3.6482841448288084, "grad_norm": 2.212289571762085, "learning_rate": 8.610152532789377e-07, "loss": 1.9667, "num_input_tokens_seen": 188676600, "step": 185300 }, { "epoch": 3.650252997578311, "grad_norm": 1.9270789623260498, "learning_rate": 8.608725743079636e-07, "loss": 1.9937, "num_input_tokens_seen": 188779000, "step": 185400 }, { "epoch": 3.652221850327814, "grad_norm": 2.1115171909332275, "learning_rate": 8.60729833973652e-07, "loss": 1.9928, "num_input_tokens_seen": 188881400, "step": 185500 }, { "epoch": 3.654190703077317, "grad_norm": 1.9058347940444946, "learning_rate": 8.605870323002745e-07, "loss": 1.9409, "num_input_tokens_seen": 188983800, "step": 185600 }, { "epoch": 3.65615955582682, "grad_norm": 1.9823269844055176, "learning_rate": 8.604441693121136e-07, "loss": 1.9801, "num_input_tokens_seen": 189085472, "step": 185700 }, { "epoch": 3.6581284085763226, "grad_norm": 1.990706205368042, "learning_rate": 8.603012450334619e-07, "loss": 1.9457, "num_input_tokens_seen": 189187872, "step": 185800 }, { "epoch": 3.6600972613258254, "grad_norm": 1.8885613679885864, "learning_rate": 8.601582594886223e-07, "loss": 1.9638, "num_input_tokens_seen": 189289560, "step": 185900 }, { "epoch": 3.662066114075328, "grad_norm": 1.9282426834106445, "learning_rate": 8.600152127019085e-07, "loss": 1.9725, "num_input_tokens_seen": 189391960, "step": 186000 }, { "epoch": 3.6640349668248313, "grad_norm": 1.8946210145950317, "learning_rate": 8.598721046976441e-07, "loss": 1.9753, "num_input_tokens_seen": 189494360, "step": 186100 }, { "epoch": 3.666003819574334, "grad_norm": 1.8834301233291626, "learning_rate": 8.597289355001638e-07, "loss": 1.9869, "num_input_tokens_seen": 189596760, "step": 186200 }, { "epoch": 3.667972672323837, "grad_norm": 1.8436224460601807, "learning_rate": 8.595857051338121e-07, "loss": 1.9507, "num_input_tokens_seen": 189698688, "step": 186300 }, { "epoch": 3.66994152507334, "grad_norm": 1.9974068403244019, "learning_rate": 8.594424136229439e-07, "loss": 1.9651, "num_input_tokens_seen": 189800464, "step": 186400 }, { "epoch": 3.671910377822843, "grad_norm": 2.129647970199585, "learning_rate": 8.592990609919251e-07, "loss": 1.9953, "num_input_tokens_seen": 189901968, "step": 186500 }, { "epoch": 3.6738792305723456, "grad_norm": 1.7398163080215454, "learning_rate": 8.591556472651315e-07, "loss": 1.9789, "num_input_tokens_seen": 190004368, "step": 186600 }, { "epoch": 3.6758480833218483, "grad_norm": 2.255389451980591, "learning_rate": 8.590121724669492e-07, "loss": 1.9799, "num_input_tokens_seen": 190104880, "step": 186700 }, { "epoch": 3.677816936071351, "grad_norm": 6.116527080535889, "learning_rate": 8.58868636621775e-07, "loss": 1.9964, "num_input_tokens_seen": 190205592, "step": 186800 }, { "epoch": 3.679785788820854, "grad_norm": 2.1795454025268555, "learning_rate": 8.587250397540163e-07, "loss": 2.011, "num_input_tokens_seen": 190307888, "step": 186900 }, { "epoch": 3.681754641570357, "grad_norm": 2.2472429275512695, "learning_rate": 8.585813818880898e-07, "loss": 1.9983, "num_input_tokens_seen": 190409176, "step": 187000 }, { "epoch": 3.68372349431986, "grad_norm": 1.9176465272903442, "learning_rate": 8.58437663048424e-07, "loss": 1.9523, "num_input_tokens_seen": 190511576, "step": 187100 }, { "epoch": 3.6856923470693626, "grad_norm": 1.9553813934326172, "learning_rate": 8.582938832594566e-07, "loss": 1.9646, "num_input_tokens_seen": 190612832, "step": 187200 }, { "epoch": 3.6876611998188658, "grad_norm": 1.890999436378479, "learning_rate": 8.581500425456365e-07, "loss": 1.9459, "num_input_tokens_seen": 190715232, "step": 187300 }, { "epoch": 3.6896300525683685, "grad_norm": 2.7544963359832764, "learning_rate": 8.580061409314225e-07, "loss": 2.0061, "num_input_tokens_seen": 190816392, "step": 187400 }, { "epoch": 3.6915989053178713, "grad_norm": 1.9823936223983765, "learning_rate": 8.578621784412836e-07, "loss": 1.9828, "num_input_tokens_seen": 190918792, "step": 187500 }, { "epoch": 3.693567758067374, "grad_norm": 1.8486562967300415, "learning_rate": 8.577181550996998e-07, "loss": 1.9355, "num_input_tokens_seen": 191021192, "step": 187600 }, { "epoch": 3.695536610816877, "grad_norm": 1.8696311712265015, "learning_rate": 8.575740709311607e-07, "loss": 1.9764, "num_input_tokens_seen": 191123592, "step": 187700 }, { "epoch": 3.69750546356638, "grad_norm": 2.0588529109954834, "learning_rate": 8.57429925960167e-07, "loss": 1.9945, "num_input_tokens_seen": 191224936, "step": 187800 }, { "epoch": 3.6994743163158827, "grad_norm": 1.979798436164856, "learning_rate": 8.572857202112288e-07, "loss": 1.9826, "num_input_tokens_seen": 191327336, "step": 187900 }, { "epoch": 3.7014431690653855, "grad_norm": 2.138587713241577, "learning_rate": 8.571414537088675e-07, "loss": 1.9534, "num_input_tokens_seen": 191429736, "step": 188000 }, { "epoch": 3.7034120218148887, "grad_norm": 1.895857572555542, "learning_rate": 8.569971264776141e-07, "loss": 1.9745, "num_input_tokens_seen": 191531208, "step": 188100 }, { "epoch": 3.7053808745643915, "grad_norm": 1.8352304697036743, "learning_rate": 8.568527385420105e-07, "loss": 1.9534, "num_input_tokens_seen": 191633608, "step": 188200 }, { "epoch": 3.7073497273138942, "grad_norm": 2.2083489894866943, "learning_rate": 8.567082899266087e-07, "loss": 1.9698, "num_input_tokens_seen": 191735480, "step": 188300 }, { "epoch": 3.709318580063397, "grad_norm": 2.0960662364959717, "learning_rate": 8.565637806559707e-07, "loss": 1.9902, "num_input_tokens_seen": 191837880, "step": 188400 }, { "epoch": 3.7112874328128997, "grad_norm": 1.8242642879486084, "learning_rate": 8.564192107546693e-07, "loss": 1.977, "num_input_tokens_seen": 191940280, "step": 188500 }, { "epoch": 3.713256285562403, "grad_norm": 1.9868113994598389, "learning_rate": 8.562745802472871e-07, "loss": 1.9903, "num_input_tokens_seen": 192042680, "step": 188600 }, { "epoch": 3.7152251383119057, "grad_norm": 1.7235865592956543, "learning_rate": 8.561298891584177e-07, "loss": 1.9602, "num_input_tokens_seen": 192144536, "step": 188700 }, { "epoch": 3.7171939910614085, "grad_norm": 2.305779218673706, "learning_rate": 8.559851375126641e-07, "loss": 1.9796, "num_input_tokens_seen": 192246656, "step": 188800 }, { "epoch": 3.7191628438109112, "grad_norm": 1.788556456565857, "learning_rate": 8.558403253346407e-07, "loss": 2.01, "num_input_tokens_seen": 192348552, "step": 188900 }, { "epoch": 3.7211316965604144, "grad_norm": 3.87287974357605, "learning_rate": 8.556954526489711e-07, "loss": 1.9889, "num_input_tokens_seen": 192449680, "step": 189000 }, { "epoch": 3.723100549309917, "grad_norm": 2.1033787727355957, "learning_rate": 8.555505194802899e-07, "loss": 2.0566, "num_input_tokens_seen": 192551208, "step": 189100 }, { "epoch": 3.72506940205942, "grad_norm": 1.7448862791061401, "learning_rate": 8.554055258532415e-07, "loss": 1.9601, "num_input_tokens_seen": 192651576, "step": 189200 }, { "epoch": 3.7270382548089227, "grad_norm": 1.8359603881835938, "learning_rate": 8.552604717924813e-07, "loss": 1.9817, "num_input_tokens_seen": 192753976, "step": 189300 }, { "epoch": 3.7290071075584255, "grad_norm": 1.9870225191116333, "learning_rate": 8.551153573226742e-07, "loss": 1.9943, "num_input_tokens_seen": 192855344, "step": 189400 }, { "epoch": 3.7309759603079287, "grad_norm": 1.8825901746749878, "learning_rate": 8.549701824684958e-07, "loss": 1.9913, "num_input_tokens_seen": 192957104, "step": 189500 }, { "epoch": 3.7329448130574314, "grad_norm": 1.7999411821365356, "learning_rate": 8.548249472546318e-07, "loss": 1.9509, "num_input_tokens_seen": 193058936, "step": 189600 }, { "epoch": 3.734913665806934, "grad_norm": 1.7239336967468262, "learning_rate": 8.546796517057783e-07, "loss": 2.0122, "num_input_tokens_seen": 193159800, "step": 189700 }, { "epoch": 3.7368825185564374, "grad_norm": 2.1975951194763184, "learning_rate": 8.545342958466415e-07, "loss": 2.0278, "num_input_tokens_seen": 193260264, "step": 189800 }, { "epoch": 3.73885137130594, "grad_norm": 2.1563830375671387, "learning_rate": 8.543888797019379e-07, "loss": 1.99, "num_input_tokens_seen": 193361032, "step": 189900 }, { "epoch": 3.740820224055443, "grad_norm": 1.9104723930358887, "learning_rate": 8.542434032963945e-07, "loss": 2.0445, "num_input_tokens_seen": 193462888, "step": 190000 }, { "epoch": 3.7427890768049457, "grad_norm": 1.787787914276123, "learning_rate": 8.540978666547481e-07, "loss": 2.0123, "num_input_tokens_seen": 193564496, "step": 190100 }, { "epoch": 3.7447579295544484, "grad_norm": 1.7190896272659302, "learning_rate": 8.539522698017461e-07, "loss": 1.9979, "num_input_tokens_seen": 193666456, "step": 190200 }, { "epoch": 3.7467267823039516, "grad_norm": 1.9271742105484009, "learning_rate": 8.53806612762146e-07, "loss": 2.0014, "num_input_tokens_seen": 193768856, "step": 190300 }, { "epoch": 3.7486956350534544, "grad_norm": 1.8336818218231201, "learning_rate": 8.536608955607156e-07, "loss": 2.0196, "num_input_tokens_seen": 193870632, "step": 190400 }, { "epoch": 3.750664487802957, "grad_norm": 1.9987962245941162, "learning_rate": 8.535151182222327e-07, "loss": 1.9436, "num_input_tokens_seen": 193973032, "step": 190500 }, { "epoch": 3.7526333405524603, "grad_norm": 1.9764909744262695, "learning_rate": 8.533692807714858e-07, "loss": 1.997, "num_input_tokens_seen": 194075016, "step": 190600 }, { "epoch": 3.754602193301963, "grad_norm": 1.976149559020996, "learning_rate": 8.532233832332729e-07, "loss": 1.9966, "num_input_tokens_seen": 194176776, "step": 190700 }, { "epoch": 3.756571046051466, "grad_norm": 2.051839590072632, "learning_rate": 8.530774256324032e-07, "loss": 1.9886, "num_input_tokens_seen": 194278600, "step": 190800 }, { "epoch": 3.7585398988009686, "grad_norm": 1.795717477798462, "learning_rate": 8.529314079936951e-07, "loss": 2.0213, "num_input_tokens_seen": 194380088, "step": 190900 }, { "epoch": 3.7605087515504714, "grad_norm": 2.2465925216674805, "learning_rate": 8.527853303419779e-07, "loss": 1.9507, "num_input_tokens_seen": 194481016, "step": 191000 }, { "epoch": 3.762477604299974, "grad_norm": 1.9409157037734985, "learning_rate": 8.526391927020907e-07, "loss": 2.0071, "num_input_tokens_seen": 194580744, "step": 191100 }, { "epoch": 3.7644464570494773, "grad_norm": 1.8164609670639038, "learning_rate": 8.52492995098883e-07, "loss": 1.9863, "num_input_tokens_seen": 194682592, "step": 191200 }, { "epoch": 3.76641530979898, "grad_norm": 1.8520736694335938, "learning_rate": 8.523467375572148e-07, "loss": 1.9554, "num_input_tokens_seen": 194784152, "step": 191300 }, { "epoch": 3.768384162548483, "grad_norm": 1.793675184249878, "learning_rate": 8.522004201019556e-07, "loss": 1.9964, "num_input_tokens_seen": 194884664, "step": 191400 }, { "epoch": 3.770353015297986, "grad_norm": 1.9452241659164429, "learning_rate": 8.520540427579854e-07, "loss": 1.9821, "num_input_tokens_seen": 194987064, "step": 191500 }, { "epoch": 3.772321868047489, "grad_norm": 1.653780221939087, "learning_rate": 8.519076055501948e-07, "loss": 1.9732, "num_input_tokens_seen": 195088424, "step": 191600 }, { "epoch": 3.7742907207969916, "grad_norm": 2.1882855892181396, "learning_rate": 8.517611085034839e-07, "loss": 1.973, "num_input_tokens_seen": 195190344, "step": 191700 }, { "epoch": 3.7762595735464943, "grad_norm": 1.9713499546051025, "learning_rate": 8.516145516427635e-07, "loss": 1.9973, "num_input_tokens_seen": 195290768, "step": 191800 }, { "epoch": 3.778228426295997, "grad_norm": 1.9395986795425415, "learning_rate": 8.514679349929541e-07, "loss": 1.9831, "num_input_tokens_seen": 195393168, "step": 191900 }, { "epoch": 3.7801972790455003, "grad_norm": 2.054429292678833, "learning_rate": 8.513212585789869e-07, "loss": 1.9653, "num_input_tokens_seen": 195495224, "step": 192000 }, { "epoch": 3.782166131795003, "grad_norm": 2.1292591094970703, "learning_rate": 8.511745224258028e-07, "loss": 1.9975, "num_input_tokens_seen": 195597624, "step": 192100 }, { "epoch": 3.784134984544506, "grad_norm": 2.0953760147094727, "learning_rate": 8.510277265583532e-07, "loss": 1.9709, "num_input_tokens_seen": 195700024, "step": 192200 }, { "epoch": 3.786103837294009, "grad_norm": 1.8644973039627075, "learning_rate": 8.508808710015992e-07, "loss": 1.9736, "num_input_tokens_seen": 195801880, "step": 192300 }, { "epoch": 3.7880726900435118, "grad_norm": 1.8061989545822144, "learning_rate": 8.507339557805127e-07, "loss": 1.9991, "num_input_tokens_seen": 195903720, "step": 192400 }, { "epoch": 3.7900415427930145, "grad_norm": 2.189606189727783, "learning_rate": 8.505869809200752e-07, "loss": 2.0377, "num_input_tokens_seen": 196006120, "step": 192500 }, { "epoch": 3.7920103955425173, "grad_norm": 1.904588222503662, "learning_rate": 8.504399464452785e-07, "loss": 1.9689, "num_input_tokens_seen": 196108520, "step": 192600 }, { "epoch": 3.79397924829202, "grad_norm": 2.192648410797119, "learning_rate": 8.502928523811249e-07, "loss": 1.9871, "num_input_tokens_seen": 196210176, "step": 192700 }, { "epoch": 3.7959481010415232, "grad_norm": 1.6559892892837524, "learning_rate": 8.50145698752626e-07, "loss": 1.98, "num_input_tokens_seen": 196312576, "step": 192800 }, { "epoch": 3.797916953791026, "grad_norm": 2.859254837036133, "learning_rate": 8.499984855848044e-07, "loss": 2.0401, "num_input_tokens_seen": 196414976, "step": 192900 }, { "epoch": 3.7998858065405288, "grad_norm": 2.1295316219329834, "learning_rate": 8.498512129026924e-07, "loss": 2.027, "num_input_tokens_seen": 196516168, "step": 193000 }, { "epoch": 3.801854659290032, "grad_norm": 2.002180814743042, "learning_rate": 8.497038807313324e-07, "loss": 2.0009, "num_input_tokens_seen": 196618568, "step": 193100 }, { "epoch": 3.8038235120395347, "grad_norm": 1.9041650295257568, "learning_rate": 8.495564890957769e-07, "loss": 2.003, "num_input_tokens_seen": 196719792, "step": 193200 }, { "epoch": 3.8057923647890375, "grad_norm": 2.1067962646484375, "learning_rate": 8.494090380210888e-07, "loss": 1.9892, "num_input_tokens_seen": 196821376, "step": 193300 }, { "epoch": 3.8077612175385402, "grad_norm": 2.053758382797241, "learning_rate": 8.492615275323409e-07, "loss": 1.9502, "num_input_tokens_seen": 196923776, "step": 193400 }, { "epoch": 3.809730070288043, "grad_norm": 2.005551815032959, "learning_rate": 8.49113957654616e-07, "loss": 1.9644, "num_input_tokens_seen": 197023584, "step": 193500 }, { "epoch": 3.8116989230375458, "grad_norm": 1.8815425634384155, "learning_rate": 8.489663284130071e-07, "loss": 1.9929, "num_input_tokens_seen": 197125984, "step": 193600 }, { "epoch": 3.813667775787049, "grad_norm": 1.7223732471466064, "learning_rate": 8.488186398326175e-07, "loss": 1.9873, "num_input_tokens_seen": 197227784, "step": 193700 }, { "epoch": 3.8156366285365517, "grad_norm": 1.9018996953964233, "learning_rate": 8.486708919385601e-07, "loss": 1.9699, "num_input_tokens_seen": 197330184, "step": 193800 }, { "epoch": 3.8176054812860545, "grad_norm": 1.9016982316970825, "learning_rate": 8.485230847559586e-07, "loss": 1.9702, "num_input_tokens_seen": 197432584, "step": 193900 }, { "epoch": 3.8195743340355577, "grad_norm": 2.1803274154663086, "learning_rate": 8.48375218309946e-07, "loss": 1.9834, "num_input_tokens_seen": 197534984, "step": 194000 }, { "epoch": 3.8215431867850604, "grad_norm": 1.8374851942062378, "learning_rate": 8.482272926256657e-07, "loss": 2.0257, "num_input_tokens_seen": 197636808, "step": 194100 }, { "epoch": 3.823512039534563, "grad_norm": 1.8388861417770386, "learning_rate": 8.480793077282715e-07, "loss": 1.9356, "num_input_tokens_seen": 197739208, "step": 194200 }, { "epoch": 3.825480892284066, "grad_norm": 3.2752463817596436, "learning_rate": 8.47931263642927e-07, "loss": 1.9702, "num_input_tokens_seen": 197840224, "step": 194300 }, { "epoch": 3.8274497450335687, "grad_norm": 1.958634614944458, "learning_rate": 8.477831603948056e-07, "loss": 1.9928, "num_input_tokens_seen": 197941960, "step": 194400 }, { "epoch": 3.829418597783072, "grad_norm": 1.793360948562622, "learning_rate": 8.47634998009091e-07, "loss": 1.9904, "num_input_tokens_seen": 198043864, "step": 194500 }, { "epoch": 3.8313874505325747, "grad_norm": 1.9204010963439941, "learning_rate": 8.474867765109772e-07, "loss": 2.0096, "num_input_tokens_seen": 198145640, "step": 194600 }, { "epoch": 3.8333563032820774, "grad_norm": 2.309009552001953, "learning_rate": 8.473384959256678e-07, "loss": 1.979, "num_input_tokens_seen": 198248040, "step": 194700 }, { "epoch": 3.8353251560315806, "grad_norm": 1.9729424715042114, "learning_rate": 8.471901562783767e-07, "loss": 2.0032, "num_input_tokens_seen": 198348840, "step": 194800 }, { "epoch": 3.8372940087810834, "grad_norm": 1.9262510538101196, "learning_rate": 8.47041757594328e-07, "loss": 1.9444, "num_input_tokens_seen": 198451240, "step": 194900 }, { "epoch": 3.839262861530586, "grad_norm": 2.2204275131225586, "learning_rate": 8.468932998987556e-07, "loss": 1.9965, "num_input_tokens_seen": 198552328, "step": 195000 }, { "epoch": 3.841231714280089, "grad_norm": 1.9872573614120483, "learning_rate": 8.467447832169032e-07, "loss": 2.0584, "num_input_tokens_seen": 198653104, "step": 195100 }, { "epoch": 3.8432005670295917, "grad_norm": 2.0090408325195312, "learning_rate": 8.465962075740252e-07, "loss": 1.9778, "num_input_tokens_seen": 198754936, "step": 195200 }, { "epoch": 3.845169419779095, "grad_norm": 2.006701946258545, "learning_rate": 8.464475729953853e-07, "loss": 1.969, "num_input_tokens_seen": 198857336, "step": 195300 }, { "epoch": 3.8471382725285976, "grad_norm": 1.786949634552002, "learning_rate": 8.462988795062575e-07, "loss": 1.9855, "num_input_tokens_seen": 198959736, "step": 195400 }, { "epoch": 3.8491071252781004, "grad_norm": 1.8043653964996338, "learning_rate": 8.461501271319262e-07, "loss": 1.9842, "num_input_tokens_seen": 199062136, "step": 195500 }, { "epoch": 3.851075978027603, "grad_norm": 1.873810887336731, "learning_rate": 8.460013158976856e-07, "loss": 2.0087, "num_input_tokens_seen": 199164536, "step": 195600 }, { "epoch": 3.8530448307771064, "grad_norm": 1.781235933303833, "learning_rate": 8.458524458288392e-07, "loss": 2.0274, "num_input_tokens_seen": 199266432, "step": 195700 }, { "epoch": 3.855013683526609, "grad_norm": 1.8602921962738037, "learning_rate": 8.457035169507017e-07, "loss": 1.9717, "num_input_tokens_seen": 199368832, "step": 195800 }, { "epoch": 3.856982536276112, "grad_norm": 11.825242042541504, "learning_rate": 8.455545292885966e-07, "loss": 2.0065, "num_input_tokens_seen": 199470312, "step": 195900 }, { "epoch": 3.8589513890256146, "grad_norm": 1.8490667343139648, "learning_rate": 8.454054828678586e-07, "loss": 1.9557, "num_input_tokens_seen": 199572712, "step": 196000 }, { "epoch": 3.8609202417751174, "grad_norm": 1.6781002283096313, "learning_rate": 8.452563777138315e-07, "loss": 2.0068, "num_input_tokens_seen": 199674312, "step": 196100 }, { "epoch": 3.8628890945246206, "grad_norm": 1.9158780574798584, "learning_rate": 8.451072138518694e-07, "loss": 1.9492, "num_input_tokens_seen": 199776712, "step": 196200 }, { "epoch": 3.8648579472741234, "grad_norm": 1.8332237005233765, "learning_rate": 8.449579913073362e-07, "loss": 1.9589, "num_input_tokens_seen": 199876728, "step": 196300 }, { "epoch": 3.866826800023626, "grad_norm": 1.9842472076416016, "learning_rate": 8.448087101056063e-07, "loss": 1.996, "num_input_tokens_seen": 199978424, "step": 196400 }, { "epoch": 3.8687956527731293, "grad_norm": 2.268526315689087, "learning_rate": 8.446593702720633e-07, "loss": 1.987, "num_input_tokens_seen": 200079264, "step": 196500 }, { "epoch": 3.870764505522632, "grad_norm": 1.9832903146743774, "learning_rate": 8.445099718321014e-07, "loss": 1.9845, "num_input_tokens_seen": 200180888, "step": 196600 }, { "epoch": 3.872733358272135, "grad_norm": 1.9900301694869995, "learning_rate": 8.443605148111247e-07, "loss": 1.9131, "num_input_tokens_seen": 200283288, "step": 196700 }, { "epoch": 3.8747022110216376, "grad_norm": 1.9500422477722168, "learning_rate": 8.442109992345468e-07, "loss": 1.9981, "num_input_tokens_seen": 200384456, "step": 196800 }, { "epoch": 3.8766710637711403, "grad_norm": 1.8457444906234741, "learning_rate": 8.440614251277918e-07, "loss": 1.9838, "num_input_tokens_seen": 200485656, "step": 196900 }, { "epoch": 3.8786399165206435, "grad_norm": 1.8744587898254395, "learning_rate": 8.439117925162933e-07, "loss": 2.0387, "num_input_tokens_seen": 200587456, "step": 197000 }, { "epoch": 3.8806087692701463, "grad_norm": 2.4993038177490234, "learning_rate": 8.437621014254952e-07, "loss": 2.0066, "num_input_tokens_seen": 200689856, "step": 197100 }, { "epoch": 3.882577622019649, "grad_norm": 2.0390734672546387, "learning_rate": 8.436123518808511e-07, "loss": 1.9351, "num_input_tokens_seen": 200792256, "step": 197200 }, { "epoch": 3.8845464747691523, "grad_norm": 1.6963281631469727, "learning_rate": 8.434625439078247e-07, "loss": 2.0019, "num_input_tokens_seen": 200893392, "step": 197300 }, { "epoch": 3.886515327518655, "grad_norm": 2.2399237155914307, "learning_rate": 8.433126775318899e-07, "loss": 1.9791, "num_input_tokens_seen": 200995272, "step": 197400 }, { "epoch": 3.888484180268158, "grad_norm": 1.9176384210586548, "learning_rate": 8.431627527785297e-07, "loss": 2.0071, "num_input_tokens_seen": 201097672, "step": 197500 }, { "epoch": 3.8904530330176605, "grad_norm": 1.8128925561904907, "learning_rate": 8.430127696732376e-07, "loss": 1.9774, "num_input_tokens_seen": 201200072, "step": 197600 }, { "epoch": 3.8924218857671633, "grad_norm": 2.0059993267059326, "learning_rate": 8.428627282415173e-07, "loss": 1.973, "num_input_tokens_seen": 201301280, "step": 197700 }, { "epoch": 3.894390738516666, "grad_norm": 2.077951669692993, "learning_rate": 8.427126285088819e-07, "loss": 1.9979, "num_input_tokens_seen": 201403680, "step": 197800 }, { "epoch": 3.8963595912661693, "grad_norm": 1.7671644687652588, "learning_rate": 8.425624705008546e-07, "loss": 1.9887, "num_input_tokens_seen": 201505152, "step": 197900 }, { "epoch": 3.898328444015672, "grad_norm": 1.8615261316299438, "learning_rate": 8.424122542429685e-07, "loss": 1.9901, "num_input_tokens_seen": 201607552, "step": 198000 }, { "epoch": 3.900297296765175, "grad_norm": 1.9583107233047485, "learning_rate": 8.422619797607667e-07, "loss": 1.9962, "num_input_tokens_seen": 201709952, "step": 198100 }, { "epoch": 3.902266149514678, "grad_norm": 1.7911033630371094, "learning_rate": 8.421116470798018e-07, "loss": 1.9556, "num_input_tokens_seen": 201810960, "step": 198200 }, { "epoch": 3.9042350022641807, "grad_norm": 1.959963321685791, "learning_rate": 8.419612562256369e-07, "loss": 2.028, "num_input_tokens_seen": 201913360, "step": 198300 }, { "epoch": 3.9062038550136835, "grad_norm": 1.7791067361831665, "learning_rate": 8.418108072238446e-07, "loss": 1.9523, "num_input_tokens_seen": 202015176, "step": 198400 }, { "epoch": 3.9081727077631863, "grad_norm": 2.198005437850952, "learning_rate": 8.416603001000075e-07, "loss": 2.0028, "num_input_tokens_seen": 202116712, "step": 198500 }, { "epoch": 3.910141560512689, "grad_norm": 2.0756611824035645, "learning_rate": 8.415097348797181e-07, "loss": 1.9562, "num_input_tokens_seen": 202219112, "step": 198600 }, { "epoch": 3.912110413262192, "grad_norm": 2.0650599002838135, "learning_rate": 8.413591115885788e-07, "loss": 1.9894, "num_input_tokens_seen": 202321512, "step": 198700 }, { "epoch": 3.914079266011695, "grad_norm": 2.096550226211548, "learning_rate": 8.412084302522016e-07, "loss": 1.996, "num_input_tokens_seen": 202423496, "step": 198800 }, { "epoch": 3.9160481187611977, "grad_norm": 1.5226213932037354, "learning_rate": 8.410576908962087e-07, "loss": 1.9765, "num_input_tokens_seen": 202525256, "step": 198900 }, { "epoch": 3.918016971510701, "grad_norm": 2.0522027015686035, "learning_rate": 8.409068935462322e-07, "loss": 1.9696, "num_input_tokens_seen": 202627656, "step": 199000 }, { "epoch": 3.9199858242602037, "grad_norm": 1.8121548891067505, "learning_rate": 8.407560382279136e-07, "loss": 1.9455, "num_input_tokens_seen": 202730056, "step": 199100 }, { "epoch": 3.9219546770097065, "grad_norm": 1.8804686069488525, "learning_rate": 8.406051249669049e-07, "loss": 1.9787, "num_input_tokens_seen": 202831760, "step": 199200 }, { "epoch": 3.923923529759209, "grad_norm": 1.9517990350723267, "learning_rate": 8.404541537888674e-07, "loss": 1.9847, "num_input_tokens_seen": 202934160, "step": 199300 }, { "epoch": 3.925892382508712, "grad_norm": 2.3334410190582275, "learning_rate": 8.403031247194728e-07, "loss": 1.9484, "num_input_tokens_seen": 203036560, "step": 199400 }, { "epoch": 3.927861235258215, "grad_norm": 1.965456247329712, "learning_rate": 8.401520377844018e-07, "loss": 1.959, "num_input_tokens_seen": 203138032, "step": 199500 }, { "epoch": 3.929830088007718, "grad_norm": 1.795345425605774, "learning_rate": 8.400008930093459e-07, "loss": 1.9658, "num_input_tokens_seen": 203240432, "step": 199600 }, { "epoch": 3.9317989407572207, "grad_norm": 2.119727611541748, "learning_rate": 8.398496904200059e-07, "loss": 1.9998, "num_input_tokens_seen": 203340840, "step": 199700 }, { "epoch": 3.933767793506724, "grad_norm": 1.9842429161071777, "learning_rate": 8.396984300420924e-07, "loss": 2.0053, "num_input_tokens_seen": 203440832, "step": 199800 }, { "epoch": 3.9357366462562267, "grad_norm": 2.071290969848633, "learning_rate": 8.39547111901326e-07, "loss": 2.041, "num_input_tokens_seen": 203542464, "step": 199900 }, { "epoch": 3.9377054990057294, "grad_norm": 1.716937780380249, "learning_rate": 8.393957360234371e-07, "loss": 1.9668, "num_input_tokens_seen": 203644096, "step": 200000 }, { "epoch": 3.939674351755232, "grad_norm": 1.91750168800354, "learning_rate": 8.39244302434166e-07, "loss": 2.036, "num_input_tokens_seen": 203746496, "step": 200100 }, { "epoch": 3.941643204504735, "grad_norm": 1.9597529172897339, "learning_rate": 8.390928111592624e-07, "loss": 1.998, "num_input_tokens_seen": 203848288, "step": 200200 }, { "epoch": 3.9436120572542377, "grad_norm": 2.6929409503936768, "learning_rate": 8.389412622244865e-07, "loss": 1.9637, "num_input_tokens_seen": 203950688, "step": 200300 }, { "epoch": 3.945580910003741, "grad_norm": 1.871503233909607, "learning_rate": 8.387896556556076e-07, "loss": 2.0262, "num_input_tokens_seen": 204053088, "step": 200400 }, { "epoch": 3.9475497627532437, "grad_norm": 1.8585203886032104, "learning_rate": 8.386379914784052e-07, "loss": 1.9907, "num_input_tokens_seen": 204155488, "step": 200500 }, { "epoch": 3.9495186155027464, "grad_norm": 2.4684081077575684, "learning_rate": 8.384862697186685e-07, "loss": 1.9869, "num_input_tokens_seen": 204256816, "step": 200600 }, { "epoch": 3.9514874682522496, "grad_norm": 1.9001214504241943, "learning_rate": 8.383344904021967e-07, "loss": 1.9518, "num_input_tokens_seen": 204358720, "step": 200700 }, { "epoch": 3.9534563210017524, "grad_norm": 2.191331624984741, "learning_rate": 8.381826535547985e-07, "loss": 2.001, "num_input_tokens_seen": 204461120, "step": 200800 }, { "epoch": 3.955425173751255, "grad_norm": 1.946645736694336, "learning_rate": 8.380307592022924e-07, "loss": 1.9508, "num_input_tokens_seen": 204563520, "step": 200900 }, { "epoch": 3.957394026500758, "grad_norm": 1.9473180770874023, "learning_rate": 8.378788073705068e-07, "loss": 1.9856, "num_input_tokens_seen": 204664960, "step": 201000 }, { "epoch": 3.9593628792502606, "grad_norm": 1.8633493185043335, "learning_rate": 8.377267980852795e-07, "loss": 2.0254, "num_input_tokens_seen": 204766272, "step": 201100 }, { "epoch": 3.961331731999764, "grad_norm": 1.9327882528305054, "learning_rate": 8.375747313724591e-07, "loss": 1.9597, "num_input_tokens_seen": 204868136, "step": 201200 }, { "epoch": 3.9633005847492666, "grad_norm": 2.6450247764587402, "learning_rate": 8.374226072579027e-07, "loss": 2.0266, "num_input_tokens_seen": 204970536, "step": 201300 }, { "epoch": 3.9652694374987694, "grad_norm": 1.8664968013763428, "learning_rate": 8.372704257674779e-07, "loss": 1.9837, "num_input_tokens_seen": 205072080, "step": 201400 }, { "epoch": 3.9672382902482726, "grad_norm": 1.7720860242843628, "learning_rate": 8.371181869270618e-07, "loss": 2.0002, "num_input_tokens_seen": 205173616, "step": 201500 }, { "epoch": 3.9692071429977753, "grad_norm": 2.5905239582061768, "learning_rate": 8.369658907625415e-07, "loss": 1.9925, "num_input_tokens_seen": 205275200, "step": 201600 }, { "epoch": 3.971175995747278, "grad_norm": 2.0033373832702637, "learning_rate": 8.368135372998135e-07, "loss": 2.0251, "num_input_tokens_seen": 205375624, "step": 201700 }, { "epoch": 3.973144848496781, "grad_norm": 2.1199939250946045, "learning_rate": 8.366611265647844e-07, "loss": 1.9952, "num_input_tokens_seen": 205478024, "step": 201800 }, { "epoch": 3.9751137012462836, "grad_norm": 2.075946092605591, "learning_rate": 8.365086585833701e-07, "loss": 2.0151, "num_input_tokens_seen": 205579664, "step": 201900 }, { "epoch": 3.977082553995787, "grad_norm": 1.9389138221740723, "learning_rate": 8.363561333814968e-07, "loss": 1.9899, "num_input_tokens_seen": 205681512, "step": 202000 }, { "epoch": 3.9790514067452896, "grad_norm": 1.8847119808197021, "learning_rate": 8.362035509851e-07, "loss": 2.0283, "num_input_tokens_seen": 205782336, "step": 202100 }, { "epoch": 3.9810202594947923, "grad_norm": 5.401323318481445, "learning_rate": 8.36050911420125e-07, "loss": 1.9538, "num_input_tokens_seen": 205883464, "step": 202200 }, { "epoch": 3.9829891122442955, "grad_norm": 2.1095244884490967, "learning_rate": 8.358982147125269e-07, "loss": 1.9767, "num_input_tokens_seen": 205985864, "step": 202300 }, { "epoch": 3.9849579649937983, "grad_norm": 2.071115255355835, "learning_rate": 8.357454608882704e-07, "loss": 1.9626, "num_input_tokens_seen": 206087512, "step": 202400 }, { "epoch": 3.986926817743301, "grad_norm": 1.9583262205123901, "learning_rate": 8.355926499733301e-07, "loss": 1.9848, "num_input_tokens_seen": 206188584, "step": 202500 }, { "epoch": 3.988895670492804, "grad_norm": 1.683656096458435, "learning_rate": 8.354397819936902e-07, "loss": 1.9682, "num_input_tokens_seen": 206289464, "step": 202600 }, { "epoch": 3.9908645232423066, "grad_norm": 2.043354034423828, "learning_rate": 8.352868569753446e-07, "loss": 1.9459, "num_input_tokens_seen": 206390680, "step": 202700 }, { "epoch": 3.9928333759918093, "grad_norm": 1.8235090970993042, "learning_rate": 8.351338749442969e-07, "loss": 1.9382, "num_input_tokens_seen": 206493080, "step": 202800 }, { "epoch": 3.9948022287413125, "grad_norm": 1.6625016927719116, "learning_rate": 8.349808359265604e-07, "loss": 2.0174, "num_input_tokens_seen": 206594872, "step": 202900 }, { "epoch": 3.9967710814908153, "grad_norm": 1.902225375175476, "learning_rate": 8.348277399481583e-07, "loss": 2.0029, "num_input_tokens_seen": 206696512, "step": 203000 }, { "epoch": 3.998739934240318, "grad_norm": 2.0428802967071533, "learning_rate": 8.346745870351228e-07, "loss": 1.9667, "num_input_tokens_seen": 206798912, "step": 203100 }, { "epoch": 4.000708786989821, "grad_norm": 1.9964284896850586, "learning_rate": 8.345213772134968e-07, "loss": 1.9123, "num_input_tokens_seen": 206901312, "step": 203200 }, { "epoch": 4.002677639739324, "grad_norm": 2.084804058074951, "learning_rate": 8.34368110509332e-07, "loss": 1.982, "num_input_tokens_seen": 207003112, "step": 203300 }, { "epoch": 4.004646492488827, "grad_norm": 1.9210829734802246, "learning_rate": 8.342147869486902e-07, "loss": 1.9427, "num_input_tokens_seen": 207105512, "step": 203400 }, { "epoch": 4.0066153452383295, "grad_norm": 1.6910932064056396, "learning_rate": 8.340614065576427e-07, "loss": 1.9943, "num_input_tokens_seen": 207206536, "step": 203500 }, { "epoch": 4.008584197987832, "grad_norm": 1.9586155414581299, "learning_rate": 8.339079693622706e-07, "loss": 1.9707, "num_input_tokens_seen": 207308400, "step": 203600 }, { "epoch": 4.010553050737335, "grad_norm": 1.8755433559417725, "learning_rate": 8.337544753886648e-07, "loss": 1.9741, "num_input_tokens_seen": 207410800, "step": 203700 }, { "epoch": 4.012521903486838, "grad_norm": 1.9017244577407837, "learning_rate": 8.336009246629253e-07, "loss": 1.9616, "num_input_tokens_seen": 207512616, "step": 203800 }, { "epoch": 4.014490756236341, "grad_norm": 1.9800962209701538, "learning_rate": 8.334473172111624e-07, "loss": 1.9774, "num_input_tokens_seen": 207614440, "step": 203900 }, { "epoch": 4.016459608985844, "grad_norm": 2.0316872596740723, "learning_rate": 8.332936530594956e-07, "loss": 1.9815, "num_input_tokens_seen": 207715888, "step": 204000 }, { "epoch": 4.018428461735347, "grad_norm": 1.707642674446106, "learning_rate": 8.331399322340542e-07, "loss": 1.9691, "num_input_tokens_seen": 207818288, "step": 204100 }, { "epoch": 4.02039731448485, "grad_norm": 1.788368582725525, "learning_rate": 8.329861547609771e-07, "loss": 1.9727, "num_input_tokens_seen": 207920688, "step": 204200 }, { "epoch": 4.0223661672343525, "grad_norm": 1.995225191116333, "learning_rate": 8.328323206664129e-07, "loss": 1.9449, "num_input_tokens_seen": 208022560, "step": 204300 }, { "epoch": 4.024335019983855, "grad_norm": 2.0321285724639893, "learning_rate": 8.3267842997652e-07, "loss": 2.0058, "num_input_tokens_seen": 208122328, "step": 204400 }, { "epoch": 4.026303872733358, "grad_norm": 1.9423996210098267, "learning_rate": 8.32524482717466e-07, "loss": 1.9511, "num_input_tokens_seen": 208224728, "step": 204500 }, { "epoch": 4.028272725482861, "grad_norm": 1.8356170654296875, "learning_rate": 8.323704789154282e-07, "loss": 1.962, "num_input_tokens_seen": 208327128, "step": 204600 }, { "epoch": 4.030241578232364, "grad_norm": 1.8499250411987305, "learning_rate": 8.322164185965939e-07, "loss": 2.0148, "num_input_tokens_seen": 208429528, "step": 204700 }, { "epoch": 4.032210430981867, "grad_norm": 1.967336893081665, "learning_rate": 8.320623017871596e-07, "loss": 1.956, "num_input_tokens_seen": 208531264, "step": 204800 }, { "epoch": 4.03417928373137, "grad_norm": 2.0016117095947266, "learning_rate": 8.319081285133316e-07, "loss": 1.96, "num_input_tokens_seen": 208632984, "step": 204900 }, { "epoch": 4.036148136480873, "grad_norm": 1.9814389944076538, "learning_rate": 8.317538988013259e-07, "loss": 1.9643, "num_input_tokens_seen": 208735384, "step": 205000 }, { "epoch": 4.038116989230375, "grad_norm": 2.057405710220337, "learning_rate": 8.315996126773678e-07, "loss": 1.9073, "num_input_tokens_seen": 208837336, "step": 205100 }, { "epoch": 4.040085841979878, "grad_norm": 1.9322025775909424, "learning_rate": 8.314452701676924e-07, "loss": 1.9671, "num_input_tokens_seen": 208939544, "step": 205200 }, { "epoch": 4.042054694729381, "grad_norm": 2.1665055751800537, "learning_rate": 8.312908712985443e-07, "loss": 1.9526, "num_input_tokens_seen": 209041944, "step": 205300 }, { "epoch": 4.044023547478884, "grad_norm": 2.12907075881958, "learning_rate": 8.311364160961778e-07, "loss": 2.0005, "num_input_tokens_seen": 209143344, "step": 205400 }, { "epoch": 4.045992400228387, "grad_norm": 2.1272454261779785, "learning_rate": 8.309819045868567e-07, "loss": 1.9922, "num_input_tokens_seen": 209244968, "step": 205500 }, { "epoch": 4.04796125297789, "grad_norm": 2.0681750774383545, "learning_rate": 8.308273367968543e-07, "loss": 2.0, "num_input_tokens_seen": 209347368, "step": 205600 }, { "epoch": 4.049930105727393, "grad_norm": 1.7734335660934448, "learning_rate": 8.306727127524536e-07, "loss": 1.9963, "num_input_tokens_seen": 209449768, "step": 205700 }, { "epoch": 4.051898958476896, "grad_norm": 2.0544943809509277, "learning_rate": 8.305180324799472e-07, "loss": 1.9861, "num_input_tokens_seen": 209551592, "step": 205800 }, { "epoch": 4.053867811226398, "grad_norm": 3.1180179119110107, "learning_rate": 8.303632960056372e-07, "loss": 1.9687, "num_input_tokens_seen": 209652912, "step": 205900 }, { "epoch": 4.055836663975901, "grad_norm": 1.825851321220398, "learning_rate": 8.302085033558349e-07, "loss": 1.9902, "num_input_tokens_seen": 209754880, "step": 206000 }, { "epoch": 4.057805516725404, "grad_norm": 1.6018146276474, "learning_rate": 8.300536545568618e-07, "loss": 1.9743, "num_input_tokens_seen": 209857280, "step": 206100 }, { "epoch": 4.059774369474907, "grad_norm": 2.071413993835449, "learning_rate": 8.298987496350486e-07, "loss": 2.0353, "num_input_tokens_seen": 209958416, "step": 206200 }, { "epoch": 4.061743222224409, "grad_norm": 1.945900321006775, "learning_rate": 8.297437886167356e-07, "loss": 1.9995, "num_input_tokens_seen": 210060256, "step": 206300 }, { "epoch": 4.063712074973913, "grad_norm": 2.2497408390045166, "learning_rate": 8.295887715282723e-07, "loss": 2.0072, "num_input_tokens_seen": 210162656, "step": 206400 }, { "epoch": 4.065680927723416, "grad_norm": 2.5082108974456787, "learning_rate": 8.294336983960184e-07, "loss": 2.0172, "num_input_tokens_seen": 210265056, "step": 206500 }, { "epoch": 4.067649780472919, "grad_norm": 1.8619681596755981, "learning_rate": 8.292785692463426e-07, "loss": 1.9583, "num_input_tokens_seen": 210367456, "step": 206600 }, { "epoch": 4.069618633222421, "grad_norm": 1.8918906450271606, "learning_rate": 8.291233841056235e-07, "loss": 1.9861, "num_input_tokens_seen": 210469856, "step": 206700 }, { "epoch": 4.071587485971924, "grad_norm": 1.8196094036102295, "learning_rate": 8.289681430002487e-07, "loss": 1.9968, "num_input_tokens_seen": 210572256, "step": 206800 }, { "epoch": 4.073556338721427, "grad_norm": 1.860552191734314, "learning_rate": 8.28812845956616e-07, "loss": 1.9992, "num_input_tokens_seen": 210674120, "step": 206900 }, { "epoch": 4.07552519147093, "grad_norm": 3.457857131958008, "learning_rate": 8.28657493001132e-07, "loss": 1.9544, "num_input_tokens_seen": 210776520, "step": 207000 }, { "epoch": 4.077494044220432, "grad_norm": 1.7468923330307007, "learning_rate": 8.285020841602135e-07, "loss": 2.0996, "num_input_tokens_seen": 210877536, "step": 207100 }, { "epoch": 4.079462896969936, "grad_norm": 1.8271055221557617, "learning_rate": 8.283466194602862e-07, "loss": 1.9664, "num_input_tokens_seen": 210979416, "step": 207200 }, { "epoch": 4.081431749719439, "grad_norm": 1.7764694690704346, "learning_rate": 8.281910989277858e-07, "loss": 2.0187, "num_input_tokens_seen": 211081816, "step": 207300 }, { "epoch": 4.0834006024689415, "grad_norm": 1.8398135900497437, "learning_rate": 8.28035522589157e-07, "loss": 1.9931, "num_input_tokens_seen": 211182888, "step": 207400 }, { "epoch": 4.085369455218444, "grad_norm": 1.9261059761047363, "learning_rate": 8.278798904708543e-07, "loss": 1.9812, "num_input_tokens_seen": 211285288, "step": 207500 }, { "epoch": 4.087338307967947, "grad_norm": 2.3591842651367188, "learning_rate": 8.277242025993418e-07, "loss": 1.9635, "num_input_tokens_seen": 211387040, "step": 207600 }, { "epoch": 4.08930716071745, "grad_norm": 1.8212112188339233, "learning_rate": 8.275684590010928e-07, "loss": 1.9566, "num_input_tokens_seen": 211488600, "step": 207700 }, { "epoch": 4.091276013466953, "grad_norm": 2.126899242401123, "learning_rate": 8.274126597025901e-07, "loss": 1.9676, "num_input_tokens_seen": 211591000, "step": 207800 }, { "epoch": 4.093244866216455, "grad_norm": 2.0518031120300293, "learning_rate": 8.272568047303263e-07, "loss": 1.9952, "num_input_tokens_seen": 211693224, "step": 207900 }, { "epoch": 4.095213718965958, "grad_norm": 1.884930968284607, "learning_rate": 8.27100894110803e-07, "loss": 1.9882, "num_input_tokens_seen": 211793832, "step": 208000 }, { "epoch": 4.097182571715462, "grad_norm": 2.251709222793579, "learning_rate": 8.269449278705315e-07, "loss": 1.9612, "num_input_tokens_seen": 211896232, "step": 208100 }, { "epoch": 4.0991514244649645, "grad_norm": 1.875095248222351, "learning_rate": 8.267889060360327e-07, "loss": 2.0129, "num_input_tokens_seen": 211996336, "step": 208200 }, { "epoch": 4.101120277214467, "grad_norm": 1.8317281007766724, "learning_rate": 8.266328286338367e-07, "loss": 1.9711, "num_input_tokens_seen": 212096544, "step": 208300 }, { "epoch": 4.10308912996397, "grad_norm": 1.7605141401290894, "learning_rate": 8.264766956904829e-07, "loss": 2.0407, "num_input_tokens_seen": 212197544, "step": 208400 }, { "epoch": 4.105057982713473, "grad_norm": 2.077226161956787, "learning_rate": 8.263205072325208e-07, "loss": 2.0048, "num_input_tokens_seen": 212298336, "step": 208500 }, { "epoch": 4.1070268354629755, "grad_norm": 2.268589973449707, "learning_rate": 8.261642632865088e-07, "loss": 1.9792, "num_input_tokens_seen": 212400512, "step": 208600 }, { "epoch": 4.108995688212478, "grad_norm": 1.9634552001953125, "learning_rate": 8.260079638790147e-07, "loss": 1.982, "num_input_tokens_seen": 212502912, "step": 208700 }, { "epoch": 4.110964540961981, "grad_norm": 1.8080847263336182, "learning_rate": 8.258516090366161e-07, "loss": 2.0355, "num_input_tokens_seen": 212605312, "step": 208800 }, { "epoch": 4.112933393711485, "grad_norm": 1.8589696884155273, "learning_rate": 8.256951987858998e-07, "loss": 2.001, "num_input_tokens_seen": 212706944, "step": 208900 }, { "epoch": 4.1149022464609875, "grad_norm": 2.4436771869659424, "learning_rate": 8.255387331534619e-07, "loss": 1.9415, "num_input_tokens_seen": 212809344, "step": 209000 }, { "epoch": 4.11687109921049, "grad_norm": 1.895723581314087, "learning_rate": 8.253822121659082e-07, "loss": 2.0208, "num_input_tokens_seen": 212910824, "step": 209100 }, { "epoch": 4.118839951959993, "grad_norm": 1.7055015563964844, "learning_rate": 8.252256358498538e-07, "loss": 1.942, "num_input_tokens_seen": 213013224, "step": 209200 }, { "epoch": 4.120808804709496, "grad_norm": 1.7317379713058472, "learning_rate": 8.250690042319229e-07, "loss": 1.9911, "num_input_tokens_seen": 213115624, "step": 209300 }, { "epoch": 4.1227776574589985, "grad_norm": 2.1245877742767334, "learning_rate": 8.249123173387497e-07, "loss": 2.0294, "num_input_tokens_seen": 213215712, "step": 209400 }, { "epoch": 4.124746510208501, "grad_norm": 2.0705044269561768, "learning_rate": 8.247555751969773e-07, "loss": 2.0028, "num_input_tokens_seen": 213317184, "step": 209500 }, { "epoch": 4.126715362958004, "grad_norm": 1.9951545000076294, "learning_rate": 8.245987778332586e-07, "loss": 1.9659, "num_input_tokens_seen": 213418608, "step": 209600 }, { "epoch": 4.128684215707508, "grad_norm": 1.8356311321258545, "learning_rate": 8.244419252742553e-07, "loss": 1.9893, "num_input_tokens_seen": 213521008, "step": 209700 }, { "epoch": 4.13065306845701, "grad_norm": 2.0035300254821777, "learning_rate": 8.242850175466392e-07, "loss": 2.003, "num_input_tokens_seen": 213622856, "step": 209800 }, { "epoch": 4.132621921206513, "grad_norm": 1.7533024549484253, "learning_rate": 8.24128054677091e-07, "loss": 1.9662, "num_input_tokens_seen": 213724440, "step": 209900 }, { "epoch": 4.134590773956016, "grad_norm": 1.7321463823318481, "learning_rate": 8.23971036692301e-07, "loss": 1.9721, "num_input_tokens_seen": 213826840, "step": 210000 }, { "epoch": 4.136559626705519, "grad_norm": 1.9174144268035889, "learning_rate": 8.238139636189687e-07, "loss": 1.9552, "num_input_tokens_seen": 213929240, "step": 210100 }, { "epoch": 4.1385284794550214, "grad_norm": 2.0311577320098877, "learning_rate": 8.23656835483803e-07, "loss": 1.9565, "num_input_tokens_seen": 214031640, "step": 210200 }, { "epoch": 4.140497332204524, "grad_norm": 1.895086646080017, "learning_rate": 8.234996523135224e-07, "loss": 1.9584, "num_input_tokens_seen": 214134040, "step": 210300 }, { "epoch": 4.142466184954027, "grad_norm": 1.6728761196136475, "learning_rate": 8.233424141348542e-07, "loss": 1.949, "num_input_tokens_seen": 214236440, "step": 210400 }, { "epoch": 4.14443503770353, "grad_norm": 2.218950033187866, "learning_rate": 8.23185120974536e-07, "loss": 1.9878, "num_input_tokens_seen": 214337696, "step": 210500 }, { "epoch": 4.146403890453033, "grad_norm": 1.7907037734985352, "learning_rate": 8.230277728593136e-07, "loss": 2.0009, "num_input_tokens_seen": 214439176, "step": 210600 }, { "epoch": 4.148372743202536, "grad_norm": 2.00350022315979, "learning_rate": 8.228703698159429e-07, "loss": 1.9421, "num_input_tokens_seen": 214540328, "step": 210700 }, { "epoch": 4.150341595952039, "grad_norm": 1.9227792024612427, "learning_rate": 8.227129118711892e-07, "loss": 1.9908, "num_input_tokens_seen": 214641656, "step": 210800 }, { "epoch": 4.152310448701542, "grad_norm": 1.8052361011505127, "learning_rate": 8.225553990518267e-07, "loss": 1.9454, "num_input_tokens_seen": 214744056, "step": 210900 }, { "epoch": 4.154279301451044, "grad_norm": 2.1096808910369873, "learning_rate": 8.223978313846392e-07, "loss": 2.0, "num_input_tokens_seen": 214845760, "step": 211000 }, { "epoch": 4.156248154200547, "grad_norm": 1.8590428829193115, "learning_rate": 8.222402088964195e-07, "loss": 2.0573, "num_input_tokens_seen": 214947368, "step": 211100 }, { "epoch": 4.15821700695005, "grad_norm": 1.908556342124939, "learning_rate": 8.220825316139703e-07, "loss": 1.929, "num_input_tokens_seen": 215049768, "step": 211200 }, { "epoch": 4.160185859699553, "grad_norm": 1.777230978012085, "learning_rate": 8.219247995641032e-07, "loss": 1.9528, "num_input_tokens_seen": 215151352, "step": 211300 }, { "epoch": 4.162154712449056, "grad_norm": 6.116434574127197, "learning_rate": 8.217670127736391e-07, "loss": 1.9722, "num_input_tokens_seen": 215252352, "step": 211400 }, { "epoch": 4.164123565198559, "grad_norm": 2.3004748821258545, "learning_rate": 8.216091712694086e-07, "loss": 2.0707, "num_input_tokens_seen": 215353984, "step": 211500 }, { "epoch": 4.166092417948062, "grad_norm": 1.8602970838546753, "learning_rate": 8.214512750782509e-07, "loss": 1.9643, "num_input_tokens_seen": 215455768, "step": 211600 }, { "epoch": 4.168061270697565, "grad_norm": 1.7709438800811768, "learning_rate": 8.212933242270151e-07, "loss": 1.9846, "num_input_tokens_seen": 215557464, "step": 211700 }, { "epoch": 4.170030123447067, "grad_norm": 1.8944815397262573, "learning_rate": 8.211353187425593e-07, "loss": 1.9754, "num_input_tokens_seen": 215658584, "step": 211800 }, { "epoch": 4.17199897619657, "grad_norm": 2.141850471496582, "learning_rate": 8.209772586517513e-07, "loss": 1.9906, "num_input_tokens_seen": 215760984, "step": 211900 }, { "epoch": 4.173967828946073, "grad_norm": 2.012176036834717, "learning_rate": 8.208191439814679e-07, "loss": 2.0133, "num_input_tokens_seen": 215862536, "step": 212000 }, { "epoch": 4.175936681695576, "grad_norm": 2.2813560962677, "learning_rate": 8.206609747585949e-07, "loss": 2.0069, "num_input_tokens_seen": 215963792, "step": 212100 }, { "epoch": 4.177905534445079, "grad_norm": 2.2592687606811523, "learning_rate": 8.205027510100275e-07, "loss": 1.9822, "num_input_tokens_seen": 216066192, "step": 212200 }, { "epoch": 4.179874387194582, "grad_norm": 2.3675546646118164, "learning_rate": 8.203444727626708e-07, "loss": 1.9829, "num_input_tokens_seen": 216168000, "step": 212300 }, { "epoch": 4.181843239944085, "grad_norm": 1.8685001134872437, "learning_rate": 8.201861400434382e-07, "loss": 2.0066, "num_input_tokens_seen": 216268720, "step": 212400 }, { "epoch": 4.183812092693588, "grad_norm": 2.0438554286956787, "learning_rate": 8.200277528792531e-07, "loss": 1.9843, "num_input_tokens_seen": 216371120, "step": 212500 }, { "epoch": 4.18578094544309, "grad_norm": 2.06326961517334, "learning_rate": 8.198693112970478e-07, "loss": 1.9634, "num_input_tokens_seen": 216473520, "step": 212600 }, { "epoch": 4.187749798192593, "grad_norm": 1.8550713062286377, "learning_rate": 8.197108153237642e-07, "loss": 1.9755, "num_input_tokens_seen": 216575920, "step": 212700 }, { "epoch": 4.189718650942096, "grad_norm": 1.854211688041687, "learning_rate": 8.19552264986353e-07, "loss": 1.9761, "num_input_tokens_seen": 216678320, "step": 212800 }, { "epoch": 4.191687503691599, "grad_norm": 2.236013889312744, "learning_rate": 8.193936603117741e-07, "loss": 1.9775, "num_input_tokens_seen": 216780072, "step": 212900 }, { "epoch": 4.193656356441101, "grad_norm": 1.5717908143997192, "learning_rate": 8.192350013269975e-07, "loss": 2.0049, "num_input_tokens_seen": 216882472, "step": 213000 }, { "epoch": 4.195625209190605, "grad_norm": 1.8222144842147827, "learning_rate": 8.190762880590013e-07, "loss": 2.0121, "num_input_tokens_seen": 216984072, "step": 213100 }, { "epoch": 4.197594061940108, "grad_norm": 2.036623477935791, "learning_rate": 8.189175205347735e-07, "loss": 2.0088, "num_input_tokens_seen": 217085240, "step": 213200 }, { "epoch": 4.1995629146896105, "grad_norm": 1.8548247814178467, "learning_rate": 8.187586987813113e-07, "loss": 1.9197, "num_input_tokens_seen": 217186816, "step": 213300 }, { "epoch": 4.201531767439113, "grad_norm": 1.9620369672775269, "learning_rate": 8.185998228256209e-07, "loss": 1.9741, "num_input_tokens_seen": 217288736, "step": 213400 }, { "epoch": 4.203500620188616, "grad_norm": 2.084591865539551, "learning_rate": 8.184408926947179e-07, "loss": 1.9746, "num_input_tokens_seen": 217391136, "step": 213500 }, { "epoch": 4.205469472938119, "grad_norm": 1.709738850593567, "learning_rate": 8.182819084156268e-07, "loss": 1.955, "num_input_tokens_seen": 217493536, "step": 213600 }, { "epoch": 4.2074383256876215, "grad_norm": 2.3697879314422607, "learning_rate": 8.181228700153817e-07, "loss": 1.9938, "num_input_tokens_seen": 217595096, "step": 213700 }, { "epoch": 4.209407178437124, "grad_norm": 1.8364559412002563, "learning_rate": 8.179637775210258e-07, "loss": 1.9411, "num_input_tokens_seen": 217697496, "step": 213800 }, { "epoch": 4.211376031186628, "grad_norm": 2.0698494911193848, "learning_rate": 8.178046309596111e-07, "loss": 2.0071, "num_input_tokens_seen": 217799896, "step": 213900 }, { "epoch": 4.213344883936131, "grad_norm": 2.2861826419830322, "learning_rate": 8.176454303581998e-07, "loss": 2.0019, "num_input_tokens_seen": 217902296, "step": 214000 }, { "epoch": 4.2153137366856335, "grad_norm": 2.09855055809021, "learning_rate": 8.17486175743862e-07, "loss": 1.9549, "num_input_tokens_seen": 218004696, "step": 214100 }, { "epoch": 4.217282589435136, "grad_norm": 1.7645584344863892, "learning_rate": 8.173268671436779e-07, "loss": 1.9808, "num_input_tokens_seen": 218105096, "step": 214200 }, { "epoch": 4.219251442184639, "grad_norm": 2.2229950428009033, "learning_rate": 8.171675045847363e-07, "loss": 2.0014, "num_input_tokens_seen": 218207496, "step": 214300 }, { "epoch": 4.221220294934142, "grad_norm": 2.069821834564209, "learning_rate": 8.170080880941359e-07, "loss": 1.9452, "num_input_tokens_seen": 218309896, "step": 214400 }, { "epoch": 4.2231891476836445, "grad_norm": 2.1184885501861572, "learning_rate": 8.168486176989838e-07, "loss": 2.0071, "num_input_tokens_seen": 218411864, "step": 214500 }, { "epoch": 4.225158000433147, "grad_norm": 2.1211719512939453, "learning_rate": 8.166890934263968e-07, "loss": 1.9598, "num_input_tokens_seen": 218513744, "step": 214600 }, { "epoch": 4.22712685318265, "grad_norm": 1.9384825229644775, "learning_rate": 8.165295153035003e-07, "loss": 1.922, "num_input_tokens_seen": 218616144, "step": 214700 }, { "epoch": 4.229095705932154, "grad_norm": 2.017470598220825, "learning_rate": 8.163698833574297e-07, "loss": 2.0144, "num_input_tokens_seen": 218718544, "step": 214800 }, { "epoch": 4.231064558681656, "grad_norm": 2.218912124633789, "learning_rate": 8.162101976153286e-07, "loss": 1.997, "num_input_tokens_seen": 218820944, "step": 214900 }, { "epoch": 4.233033411431159, "grad_norm": 2.1257431507110596, "learning_rate": 8.160504581043506e-07, "loss": 1.987, "num_input_tokens_seen": 218923128, "step": 215000 }, { "epoch": 4.235002264180662, "grad_norm": 1.788659691810608, "learning_rate": 8.158906648516578e-07, "loss": 1.928, "num_input_tokens_seen": 219024752, "step": 215100 }, { "epoch": 4.236971116930165, "grad_norm": 1.8117871284484863, "learning_rate": 8.157308178844218e-07, "loss": 2.0293, "num_input_tokens_seen": 219126840, "step": 215200 }, { "epoch": 4.2389399696796675, "grad_norm": 1.7583394050598145, "learning_rate": 8.155709172298233e-07, "loss": 2.0154, "num_input_tokens_seen": 219229240, "step": 215300 }, { "epoch": 4.24090882242917, "grad_norm": 1.9496679306030273, "learning_rate": 8.154109629150518e-07, "loss": 1.9819, "num_input_tokens_seen": 219331224, "step": 215400 }, { "epoch": 4.242877675178673, "grad_norm": 1.9294391870498657, "learning_rate": 8.152509549673066e-07, "loss": 1.9872, "num_input_tokens_seen": 219432704, "step": 215500 }, { "epoch": 4.244846527928177, "grad_norm": 1.7344541549682617, "learning_rate": 8.150908934137952e-07, "loss": 1.9877, "num_input_tokens_seen": 219535104, "step": 215600 }, { "epoch": 4.246815380677679, "grad_norm": 3.2139742374420166, "learning_rate": 8.149307782817352e-07, "loss": 1.9592, "num_input_tokens_seen": 219636744, "step": 215700 }, { "epoch": 4.248784233427182, "grad_norm": 1.9002854824066162, "learning_rate": 8.147706095983525e-07, "loss": 1.9351, "num_input_tokens_seen": 219739144, "step": 215800 }, { "epoch": 4.250753086176685, "grad_norm": 1.993449091911316, "learning_rate": 8.146103873908825e-07, "loss": 1.9986, "num_input_tokens_seen": 219841544, "step": 215900 }, { "epoch": 4.252721938926188, "grad_norm": 2.0600881576538086, "learning_rate": 8.144501116865696e-07, "loss": 1.9911, "num_input_tokens_seen": 219943944, "step": 216000 }, { "epoch": 4.25469079167569, "grad_norm": 1.9502677917480469, "learning_rate": 8.142897825126675e-07, "loss": 2.0185, "num_input_tokens_seen": 220046248, "step": 216100 }, { "epoch": 4.256659644425193, "grad_norm": 2.3295066356658936, "learning_rate": 8.141293998964388e-07, "loss": 1.9602, "num_input_tokens_seen": 220148648, "step": 216200 }, { "epoch": 4.258628497174696, "grad_norm": 1.9984863996505737, "learning_rate": 8.13968963865155e-07, "loss": 1.9947, "num_input_tokens_seen": 220250280, "step": 216300 }, { "epoch": 4.2605973499242, "grad_norm": 1.7733039855957031, "learning_rate": 8.138084744460971e-07, "loss": 2.0138, "num_input_tokens_seen": 220351728, "step": 216400 }, { "epoch": 4.262566202673702, "grad_norm": 1.9590157270431519, "learning_rate": 8.136479316665549e-07, "loss": 1.9595, "num_input_tokens_seen": 220453528, "step": 216500 }, { "epoch": 4.264535055423205, "grad_norm": 1.9967883825302124, "learning_rate": 8.134873355538274e-07, "loss": 1.9962, "num_input_tokens_seen": 220554864, "step": 216600 }, { "epoch": 4.266503908172708, "grad_norm": 1.8763107061386108, "learning_rate": 8.133266861352226e-07, "loss": 2.0066, "num_input_tokens_seen": 220656696, "step": 216700 }, { "epoch": 4.268472760922211, "grad_norm": 2.052912712097168, "learning_rate": 8.131659834380574e-07, "loss": 1.9398, "num_input_tokens_seen": 220759096, "step": 216800 }, { "epoch": 4.270441613671713, "grad_norm": 1.9175455570220947, "learning_rate": 8.130052274896581e-07, "loss": 1.9424, "num_input_tokens_seen": 220860648, "step": 216900 }, { "epoch": 4.272410466421216, "grad_norm": 1.7989146709442139, "learning_rate": 8.128444183173599e-07, "loss": 2.0372, "num_input_tokens_seen": 220963048, "step": 217000 }, { "epoch": 4.274379319170719, "grad_norm": 1.8976370096206665, "learning_rate": 8.12683555948507e-07, "loss": 1.9601, "num_input_tokens_seen": 221065448, "step": 217100 }, { "epoch": 4.2763481719202225, "grad_norm": 1.6844745874404907, "learning_rate": 8.125226404104528e-07, "loss": 1.9583, "num_input_tokens_seen": 221167848, "step": 217200 }, { "epoch": 4.278317024669725, "grad_norm": 1.6370481252670288, "learning_rate": 8.123616717305595e-07, "loss": 1.996, "num_input_tokens_seen": 221269760, "step": 217300 }, { "epoch": 4.280285877419228, "grad_norm": 2.272353410720825, "learning_rate": 8.122006499361984e-07, "loss": 1.9959, "num_input_tokens_seen": 221371328, "step": 217400 }, { "epoch": 4.282254730168731, "grad_norm": 2.06241774559021, "learning_rate": 8.120395750547501e-07, "loss": 1.9787, "num_input_tokens_seen": 221473728, "step": 217500 }, { "epoch": 4.284223582918234, "grad_norm": 1.9038654565811157, "learning_rate": 8.11878447113604e-07, "loss": 1.9408, "num_input_tokens_seen": 221575632, "step": 217600 }, { "epoch": 4.286192435667736, "grad_norm": 1.8895894289016724, "learning_rate": 8.117172661401584e-07, "loss": 1.9984, "num_input_tokens_seen": 221677184, "step": 217700 }, { "epoch": 4.288161288417239, "grad_norm": 2.1330461502075195, "learning_rate": 8.115560321618206e-07, "loss": 1.9746, "num_input_tokens_seen": 221779584, "step": 217800 }, { "epoch": 4.290130141166742, "grad_norm": 2.0095295906066895, "learning_rate": 8.113947452060076e-07, "loss": 2.002, "num_input_tokens_seen": 221881984, "step": 217900 }, { "epoch": 4.292098993916245, "grad_norm": 1.8864161968231201, "learning_rate": 8.112334053001444e-07, "loss": 1.985, "num_input_tokens_seen": 221983856, "step": 218000 }, { "epoch": 4.294067846665748, "grad_norm": 2.039607286453247, "learning_rate": 8.110720124716659e-07, "loss": 1.985, "num_input_tokens_seen": 222085744, "step": 218100 }, { "epoch": 4.296036699415251, "grad_norm": 1.9328733682632446, "learning_rate": 8.109105667480153e-07, "loss": 1.9594, "num_input_tokens_seen": 222187536, "step": 218200 }, { "epoch": 4.298005552164754, "grad_norm": 2.0978965759277344, "learning_rate": 8.107490681566451e-07, "loss": 1.982, "num_input_tokens_seen": 222289352, "step": 218300 }, { "epoch": 4.2999744049142565, "grad_norm": 1.7404478788375854, "learning_rate": 8.10587516725017e-07, "loss": 1.9871, "num_input_tokens_seen": 222391752, "step": 218400 }, { "epoch": 4.301943257663759, "grad_norm": 1.785993218421936, "learning_rate": 8.104259124806012e-07, "loss": 1.9659, "num_input_tokens_seen": 222494152, "step": 218500 }, { "epoch": 4.303912110413262, "grad_norm": 2.1978917121887207, "learning_rate": 8.102642554508772e-07, "loss": 1.9512, "num_input_tokens_seen": 222595672, "step": 218600 }, { "epoch": 4.305880963162765, "grad_norm": 1.7965723276138306, "learning_rate": 8.101025456633334e-07, "loss": 1.9729, "num_input_tokens_seen": 222698072, "step": 218700 }, { "epoch": 4.307849815912268, "grad_norm": 2.0766849517822266, "learning_rate": 8.099407831454674e-07, "loss": 2.0122, "num_input_tokens_seen": 222798968, "step": 218800 }, { "epoch": 4.309818668661771, "grad_norm": 2.1054444313049316, "learning_rate": 8.097789679247853e-07, "loss": 1.959, "num_input_tokens_seen": 222901368, "step": 218900 }, { "epoch": 4.311787521411274, "grad_norm": 1.8245131969451904, "learning_rate": 8.096171000288025e-07, "loss": 2.0052, "num_input_tokens_seen": 223003768, "step": 219000 }, { "epoch": 4.313756374160777, "grad_norm": 1.8209213018417358, "learning_rate": 8.094551794850432e-07, "loss": 2.0242, "num_input_tokens_seen": 223105800, "step": 219100 }, { "epoch": 4.3157252269102795, "grad_norm": 1.971667766571045, "learning_rate": 8.092932063210409e-07, "loss": 1.9974, "num_input_tokens_seen": 223208200, "step": 219200 }, { "epoch": 4.317694079659782, "grad_norm": 2.0280184745788574, "learning_rate": 8.091311805643375e-07, "loss": 2.0031, "num_input_tokens_seen": 223310600, "step": 219300 }, { "epoch": 4.319662932409285, "grad_norm": 2.1408958435058594, "learning_rate": 8.089691022424841e-07, "loss": 1.9802, "num_input_tokens_seen": 223413000, "step": 219400 }, { "epoch": 4.321631785158788, "grad_norm": 2.1364798545837402, "learning_rate": 8.088069713830408e-07, "loss": 1.9982, "num_input_tokens_seen": 223514864, "step": 219500 }, { "epoch": 4.3236006379082905, "grad_norm": 2.9700281620025635, "learning_rate": 8.086447880135767e-07, "loss": 1.9846, "num_input_tokens_seen": 223616984, "step": 219600 }, { "epoch": 4.325569490657793, "grad_norm": 2.3307693004608154, "learning_rate": 8.084825521616696e-07, "loss": 1.9966, "num_input_tokens_seen": 223717896, "step": 219700 }, { "epoch": 4.327538343407297, "grad_norm": 5.6779255867004395, "learning_rate": 8.083202638549063e-07, "loss": 2.0122, "num_input_tokens_seen": 223819536, "step": 219800 }, { "epoch": 4.3295071961568, "grad_norm": 1.9002426862716675, "learning_rate": 8.081579231208827e-07, "loss": 1.9533, "num_input_tokens_seen": 223921936, "step": 219900 }, { "epoch": 4.3314760489063024, "grad_norm": 1.9026561975479126, "learning_rate": 8.079955299872034e-07, "loss": 1.956, "num_input_tokens_seen": 224023568, "step": 220000 }, { "epoch": 4.333444901655805, "grad_norm": 1.7311137914657593, "learning_rate": 8.078330844814819e-07, "loss": 1.9851, "num_input_tokens_seen": 224125272, "step": 220100 }, { "epoch": 4.335413754405308, "grad_norm": 1.9070080518722534, "learning_rate": 8.076705866313408e-07, "loss": 2.0065, "num_input_tokens_seen": 224227672, "step": 220200 }, { "epoch": 4.337382607154811, "grad_norm": 1.7994564771652222, "learning_rate": 8.075080364644116e-07, "loss": 2.0414, "num_input_tokens_seen": 224329800, "step": 220300 }, { "epoch": 4.3393514599043135, "grad_norm": 1.876141905784607, "learning_rate": 8.073454340083342e-07, "loss": 2.0027, "num_input_tokens_seen": 224431352, "step": 220400 }, { "epoch": 4.341320312653816, "grad_norm": 1.9209034442901611, "learning_rate": 8.071827792907582e-07, "loss": 1.9898, "num_input_tokens_seen": 224532992, "step": 220500 }, { "epoch": 4.34328916540332, "grad_norm": 2.1137211322784424, "learning_rate": 8.070200723393414e-07, "loss": 2.0278, "num_input_tokens_seen": 224634448, "step": 220600 }, { "epoch": 4.345258018152823, "grad_norm": 1.9331674575805664, "learning_rate": 8.068573131817508e-07, "loss": 2.0014, "num_input_tokens_seen": 224735272, "step": 220700 }, { "epoch": 4.347226870902325, "grad_norm": 1.8493545055389404, "learning_rate": 8.066945018456623e-07, "loss": 1.9861, "num_input_tokens_seen": 224836856, "step": 220800 }, { "epoch": 4.349195723651828, "grad_norm": 1.6915216445922852, "learning_rate": 8.065316383587606e-07, "loss": 1.991, "num_input_tokens_seen": 224938760, "step": 220900 }, { "epoch": 4.351164576401331, "grad_norm": 2.0094316005706787, "learning_rate": 8.06368722748739e-07, "loss": 1.9606, "num_input_tokens_seen": 225040864, "step": 221000 }, { "epoch": 4.353133429150834, "grad_norm": 2.79548978805542, "learning_rate": 8.062057550433003e-07, "loss": 1.9728, "num_input_tokens_seen": 225143264, "step": 221100 }, { "epoch": 4.355102281900336, "grad_norm": 1.8829996585845947, "learning_rate": 8.060427352701557e-07, "loss": 2.0219, "num_input_tokens_seen": 225245080, "step": 221200 }, { "epoch": 4.357071134649839, "grad_norm": 2.938256025314331, "learning_rate": 8.058796634570251e-07, "loss": 1.9436, "num_input_tokens_seen": 225346856, "step": 221300 }, { "epoch": 4.359039987399342, "grad_norm": 1.8306713104248047, "learning_rate": 8.057165396316377e-07, "loss": 2.0174, "num_input_tokens_seen": 225448656, "step": 221400 }, { "epoch": 4.361008840148846, "grad_norm": 2.3685410022735596, "learning_rate": 8.055533638217312e-07, "loss": 1.9718, "num_input_tokens_seen": 225551056, "step": 221500 }, { "epoch": 4.362977692898348, "grad_norm": 1.8687312602996826, "learning_rate": 8.053901360550523e-07, "loss": 1.9771, "num_input_tokens_seen": 225652584, "step": 221600 }, { "epoch": 4.364946545647851, "grad_norm": 2.394420862197876, "learning_rate": 8.052268563593567e-07, "loss": 2.0222, "num_input_tokens_seen": 225754984, "step": 221700 }, { "epoch": 4.366915398397354, "grad_norm": 2.370257616043091, "learning_rate": 8.050635247624086e-07, "loss": 1.9827, "num_input_tokens_seen": 225856552, "step": 221800 }, { "epoch": 4.368884251146857, "grad_norm": 1.8775486946105957, "learning_rate": 8.04900141291981e-07, "loss": 1.9879, "num_input_tokens_seen": 225958496, "step": 221900 }, { "epoch": 4.370853103896359, "grad_norm": 1.8185542821884155, "learning_rate": 8.047367059758559e-07, "loss": 2.0074, "num_input_tokens_seen": 226060896, "step": 222000 }, { "epoch": 4.372821956645862, "grad_norm": 2.056126117706299, "learning_rate": 8.045732188418244e-07, "loss": 1.9692, "num_input_tokens_seen": 226161880, "step": 222100 }, { "epoch": 4.374790809395365, "grad_norm": 1.9742474555969238, "learning_rate": 8.04409679917686e-07, "loss": 1.9683, "num_input_tokens_seen": 226263008, "step": 222200 }, { "epoch": 4.376759662144869, "grad_norm": 1.935840129852295, "learning_rate": 8.04246089231249e-07, "loss": 1.9961, "num_input_tokens_seen": 226364176, "step": 222300 }, { "epoch": 4.378728514894371, "grad_norm": 1.9568092823028564, "learning_rate": 8.040824468103306e-07, "loss": 1.982, "num_input_tokens_seen": 226466576, "step": 222400 }, { "epoch": 4.380697367643874, "grad_norm": 1.8954073190689087, "learning_rate": 8.039187526827568e-07, "loss": 1.996, "num_input_tokens_seen": 226568976, "step": 222500 }, { "epoch": 4.382666220393377, "grad_norm": 2.0917599201202393, "learning_rate": 8.037550068763625e-07, "loss": 1.9948, "num_input_tokens_seen": 226671376, "step": 222600 }, { "epoch": 4.38463507314288, "grad_norm": 1.7765631675720215, "learning_rate": 8.035912094189914e-07, "loss": 1.9881, "num_input_tokens_seen": 226773776, "step": 222700 }, { "epoch": 4.386603925892382, "grad_norm": 5.36970853805542, "learning_rate": 8.034273603384956e-07, "loss": 1.9644, "num_input_tokens_seen": 226876176, "step": 222800 }, { "epoch": 4.388572778641885, "grad_norm": 1.9368995428085327, "learning_rate": 8.032634596627366e-07, "loss": 1.934, "num_input_tokens_seen": 226976896, "step": 222900 }, { "epoch": 4.390541631391388, "grad_norm": 2.9717519283294678, "learning_rate": 8.030995074195839e-07, "loss": 2.0059, "num_input_tokens_seen": 227078272, "step": 223000 }, { "epoch": 4.3925104841408915, "grad_norm": 1.9175190925598145, "learning_rate": 8.029355036369164e-07, "loss": 1.9639, "num_input_tokens_seen": 227180088, "step": 223100 }, { "epoch": 4.394479336890394, "grad_norm": 1.7901525497436523, "learning_rate": 8.027714483426218e-07, "loss": 2.0324, "num_input_tokens_seen": 227281896, "step": 223200 }, { "epoch": 4.396448189639897, "grad_norm": 2.0427134037017822, "learning_rate": 8.02607341564596e-07, "loss": 2.0029, "num_input_tokens_seen": 227383400, "step": 223300 }, { "epoch": 4.3984170423894, "grad_norm": 1.894559383392334, "learning_rate": 8.02443183330744e-07, "loss": 1.9632, "num_input_tokens_seen": 227485800, "step": 223400 }, { "epoch": 4.4003858951389025, "grad_norm": 1.9449197053909302, "learning_rate": 8.022789736689796e-07, "loss": 1.97, "num_input_tokens_seen": 227587272, "step": 223500 }, { "epoch": 4.402354747888405, "grad_norm": 1.855339765548706, "learning_rate": 8.021147126072254e-07, "loss": 2.0101, "num_input_tokens_seen": 227688816, "step": 223600 }, { "epoch": 4.404323600637908, "grad_norm": 3.689824104309082, "learning_rate": 8.019504001734124e-07, "loss": 2.0184, "num_input_tokens_seen": 227789008, "step": 223700 }, { "epoch": 4.406292453387411, "grad_norm": 1.8142951726913452, "learning_rate": 8.017860363954807e-07, "loss": 1.9591, "num_input_tokens_seen": 227890552, "step": 223800 }, { "epoch": 4.4082613061369145, "grad_norm": 1.777241587638855, "learning_rate": 8.016216213013788e-07, "loss": 1.9961, "num_input_tokens_seen": 227992664, "step": 223900 }, { "epoch": 4.410230158886417, "grad_norm": 2.256772518157959, "learning_rate": 8.014571549190642e-07, "loss": 2.0172, "num_input_tokens_seen": 228094568, "step": 224000 }, { "epoch": 4.41219901163592, "grad_norm": 2.0180375576019287, "learning_rate": 8.012926372765029e-07, "loss": 2.0225, "num_input_tokens_seen": 228196104, "step": 224100 }, { "epoch": 4.414167864385423, "grad_norm": 2.085426092147827, "learning_rate": 8.011280684016699e-07, "loss": 1.9956, "num_input_tokens_seen": 228298504, "step": 224200 }, { "epoch": 4.4161367171349255, "grad_norm": 1.8967337608337402, "learning_rate": 8.009634483225489e-07, "loss": 1.9948, "num_input_tokens_seen": 228399888, "step": 224300 }, { "epoch": 4.418105569884428, "grad_norm": 2.135732650756836, "learning_rate": 8.007987770671318e-07, "loss": 2.0094, "num_input_tokens_seen": 228500976, "step": 224400 }, { "epoch": 4.420074422633931, "grad_norm": 1.991890788078308, "learning_rate": 8.006340546634197e-07, "loss": 1.9365, "num_input_tokens_seen": 228603376, "step": 224500 }, { "epoch": 4.422043275383434, "grad_norm": 2.9969050884246826, "learning_rate": 8.004692811394223e-07, "loss": 2.0129, "num_input_tokens_seen": 228705776, "step": 224600 }, { "epoch": 4.4240121281329365, "grad_norm": 1.8463053703308105, "learning_rate": 8.00304456523158e-07, "loss": 1.9556, "num_input_tokens_seen": 228808176, "step": 224700 }, { "epoch": 4.42598098088244, "grad_norm": 1.9226841926574707, "learning_rate": 8.001395808426537e-07, "loss": 1.9627, "num_input_tokens_seen": 228909720, "step": 224800 }, { "epoch": 4.427949833631943, "grad_norm": 1.9595040082931519, "learning_rate": 7.999746541259451e-07, "loss": 2.0559, "num_input_tokens_seen": 229012120, "step": 224900 }, { "epoch": 4.429918686381446, "grad_norm": 1.708908200263977, "learning_rate": 7.998096764010767e-07, "loss": 1.9864, "num_input_tokens_seen": 229113904, "step": 225000 }, { "epoch": 4.4318875391309485, "grad_norm": 1.9762471914291382, "learning_rate": 7.996446476961015e-07, "loss": 1.9841, "num_input_tokens_seen": 229216120, "step": 225100 }, { "epoch": 4.433856391880451, "grad_norm": 1.8990074396133423, "learning_rate": 7.994795680390814e-07, "loss": 1.9934, "num_input_tokens_seen": 229317288, "step": 225200 }, { "epoch": 4.435825244629954, "grad_norm": 2.5449118614196777, "learning_rate": 7.993144374580866e-07, "loss": 1.9653, "num_input_tokens_seen": 229418184, "step": 225300 }, { "epoch": 4.437794097379457, "grad_norm": 3.2167248725891113, "learning_rate": 7.991492559811964e-07, "loss": 1.9907, "num_input_tokens_seen": 229520584, "step": 225400 }, { "epoch": 4.4397629501289595, "grad_norm": 1.797454595565796, "learning_rate": 7.989840236364981e-07, "loss": 1.9663, "num_input_tokens_seen": 229622984, "step": 225500 }, { "epoch": 4.441731802878463, "grad_norm": 1.874466896057129, "learning_rate": 7.988187404520884e-07, "loss": 2.0149, "num_input_tokens_seen": 229724216, "step": 225600 }, { "epoch": 4.443700655627966, "grad_norm": 2.0009689331054688, "learning_rate": 7.986534064560723e-07, "loss": 2.0108, "num_input_tokens_seen": 229825784, "step": 225700 }, { "epoch": 4.445669508377469, "grad_norm": 1.888047695159912, "learning_rate": 7.984880216765635e-07, "loss": 2.0118, "num_input_tokens_seen": 229926592, "step": 225800 }, { "epoch": 4.447638361126971, "grad_norm": 1.9812793731689453, "learning_rate": 7.98322586141684e-07, "loss": 1.9749, "num_input_tokens_seen": 230027664, "step": 225900 }, { "epoch": 4.449607213876474, "grad_norm": 1.8331326246261597, "learning_rate": 7.98157099879565e-07, "loss": 2.0502, "num_input_tokens_seen": 230129584, "step": 226000 }, { "epoch": 4.451576066625977, "grad_norm": 1.9073522090911865, "learning_rate": 7.979915629183458e-07, "loss": 1.9923, "num_input_tokens_seen": 230230648, "step": 226100 }, { "epoch": 4.45354491937548, "grad_norm": 1.8403912782669067, "learning_rate": 7.978259752861747e-07, "loss": 1.9725, "num_input_tokens_seen": 230332176, "step": 226200 }, { "epoch": 4.4555137721249825, "grad_norm": 1.805899977684021, "learning_rate": 7.976603370112087e-07, "loss": 2.0041, "num_input_tokens_seen": 230433304, "step": 226300 }, { "epoch": 4.457482624874485, "grad_norm": 3.5743937492370605, "learning_rate": 7.974946481216128e-07, "loss": 2.0362, "num_input_tokens_seen": 230532976, "step": 226400 }, { "epoch": 4.459451477623989, "grad_norm": 2.0350921154022217, "learning_rate": 7.973289086455612e-07, "loss": 1.987, "num_input_tokens_seen": 230634808, "step": 226500 }, { "epoch": 4.461420330373492, "grad_norm": 1.9272820949554443, "learning_rate": 7.971631186112367e-07, "loss": 1.9851, "num_input_tokens_seen": 230736392, "step": 226600 }, { "epoch": 4.463389183122994, "grad_norm": 2.0048458576202393, "learning_rate": 7.969972780468299e-07, "loss": 1.9943, "num_input_tokens_seen": 230838792, "step": 226700 }, { "epoch": 4.465358035872497, "grad_norm": 1.8114912509918213, "learning_rate": 7.968313869805412e-07, "loss": 1.9732, "num_input_tokens_seen": 230941192, "step": 226800 }, { "epoch": 4.467326888622, "grad_norm": 2.2808680534362793, "learning_rate": 7.966654454405785e-07, "loss": 2.0134, "num_input_tokens_seen": 231042680, "step": 226900 }, { "epoch": 4.469295741371503, "grad_norm": 1.9703865051269531, "learning_rate": 7.964994534551592e-07, "loss": 1.9556, "num_input_tokens_seen": 231145080, "step": 227000 }, { "epoch": 4.471264594121005, "grad_norm": 2.1411333084106445, "learning_rate": 7.963334110525086e-07, "loss": 1.9598, "num_input_tokens_seen": 231247480, "step": 227100 }, { "epoch": 4.473233446870508, "grad_norm": 1.9498172998428345, "learning_rate": 7.961673182608609e-07, "loss": 2.0634, "num_input_tokens_seen": 231348544, "step": 227200 }, { "epoch": 4.475202299620012, "grad_norm": 1.9369093179702759, "learning_rate": 7.960011751084587e-07, "loss": 2.003, "num_input_tokens_seen": 231450944, "step": 227300 }, { "epoch": 4.477171152369515, "grad_norm": 1.945993423461914, "learning_rate": 7.958349816235533e-07, "loss": 1.9968, "num_input_tokens_seen": 231551472, "step": 227400 }, { "epoch": 4.479140005119017, "grad_norm": 1.9946876764297485, "learning_rate": 7.956687378344043e-07, "loss": 1.9972, "num_input_tokens_seen": 231652656, "step": 227500 }, { "epoch": 4.48110885786852, "grad_norm": 2.0146830081939697, "learning_rate": 7.955024437692803e-07, "loss": 1.9865, "num_input_tokens_seen": 231754496, "step": 227600 }, { "epoch": 4.483077710618023, "grad_norm": 1.8295978307724, "learning_rate": 7.953360994564581e-07, "loss": 1.9826, "num_input_tokens_seen": 231856896, "step": 227700 }, { "epoch": 4.485046563367526, "grad_norm": 1.7086530923843384, "learning_rate": 7.951697049242232e-07, "loss": 1.9761, "num_input_tokens_seen": 231958848, "step": 227800 }, { "epoch": 4.487015416117028, "grad_norm": 1.8575228452682495, "learning_rate": 7.950032602008695e-07, "loss": 1.9852, "num_input_tokens_seen": 232061248, "step": 227900 }, { "epoch": 4.488984268866531, "grad_norm": 2.264408826828003, "learning_rate": 7.948367653146997e-07, "loss": 1.9611, "num_input_tokens_seen": 232163648, "step": 228000 }, { "epoch": 4.490953121616034, "grad_norm": 1.838910698890686, "learning_rate": 7.946702202940246e-07, "loss": 1.9767, "num_input_tokens_seen": 232265032, "step": 228100 }, { "epoch": 4.4929219743655375, "grad_norm": 2.100069522857666, "learning_rate": 7.945036251671639e-07, "loss": 1.9756, "num_input_tokens_seen": 232367432, "step": 228200 }, { "epoch": 4.49489082711504, "grad_norm": 2.0678296089172363, "learning_rate": 7.943369799624458e-07, "loss": 1.9786, "num_input_tokens_seen": 232469728, "step": 228300 }, { "epoch": 4.496859679864543, "grad_norm": 2.136457920074463, "learning_rate": 7.941702847082066e-07, "loss": 1.9877, "num_input_tokens_seen": 232569256, "step": 228400 }, { "epoch": 4.498828532614046, "grad_norm": 1.9161975383758545, "learning_rate": 7.940035394327918e-07, "loss": 1.9814, "num_input_tokens_seen": 232670504, "step": 228500 }, { "epoch": 4.500797385363549, "grad_norm": 1.863601803779602, "learning_rate": 7.938367441645548e-07, "loss": 1.9505, "num_input_tokens_seen": 232772904, "step": 228600 }, { "epoch": 4.502766238113051, "grad_norm": 1.7487550973892212, "learning_rate": 7.936698989318579e-07, "loss": 1.9631, "num_input_tokens_seen": 232875304, "step": 228700 }, { "epoch": 4.504735090862554, "grad_norm": 1.996494174003601, "learning_rate": 7.935030037630715e-07, "loss": 2.028, "num_input_tokens_seen": 232977704, "step": 228800 }, { "epoch": 4.506703943612058, "grad_norm": 1.7284367084503174, "learning_rate": 7.93336058686575e-07, "loss": 2.0266, "num_input_tokens_seen": 233079184, "step": 228900 }, { "epoch": 4.5086727963615605, "grad_norm": 1.7391117811203003, "learning_rate": 7.931690637307556e-07, "loss": 2.0386, "num_input_tokens_seen": 233181168, "step": 229000 }, { "epoch": 4.510641649111063, "grad_norm": 1.9778952598571777, "learning_rate": 7.930020189240098e-07, "loss": 2.0128, "num_input_tokens_seen": 233283568, "step": 229100 }, { "epoch": 4.512610501860566, "grad_norm": 1.9555339813232422, "learning_rate": 7.928349242947417e-07, "loss": 1.9569, "num_input_tokens_seen": 233385224, "step": 229200 }, { "epoch": 4.514579354610069, "grad_norm": 1.83670973777771, "learning_rate": 7.926677798713648e-07, "loss": 1.9724, "num_input_tokens_seen": 233487624, "step": 229300 }, { "epoch": 4.5165482073595715, "grad_norm": 1.8802381753921509, "learning_rate": 7.925005856823005e-07, "loss": 1.9751, "num_input_tokens_seen": 233590024, "step": 229400 }, { "epoch": 4.518517060109074, "grad_norm": 2.3416171073913574, "learning_rate": 7.923333417559785e-07, "loss": 1.9788, "num_input_tokens_seen": 233690840, "step": 229500 }, { "epoch": 4.520485912858577, "grad_norm": 1.822119951248169, "learning_rate": 7.921660481208375e-07, "loss": 2.0116, "num_input_tokens_seen": 233793240, "step": 229600 }, { "epoch": 4.52245476560808, "grad_norm": 2.103149890899658, "learning_rate": 7.919987048053243e-07, "loss": 2.0102, "num_input_tokens_seen": 233894824, "step": 229700 }, { "epoch": 4.5244236183575826, "grad_norm": 1.893359899520874, "learning_rate": 7.91831311837894e-07, "loss": 2.0355, "num_input_tokens_seen": 233996416, "step": 229800 }, { "epoch": 4.526392471107086, "grad_norm": 1.7088468074798584, "learning_rate": 7.916638692470107e-07, "loss": 2.0399, "num_input_tokens_seen": 234098728, "step": 229900 }, { "epoch": 4.528361323856589, "grad_norm": 1.9777233600616455, "learning_rate": 7.914963770611464e-07, "loss": 1.9528, "num_input_tokens_seen": 234200336, "step": 230000 }, { "epoch": 4.530330176606092, "grad_norm": 1.9436982870101929, "learning_rate": 7.913288353087817e-07, "loss": 2.0033, "num_input_tokens_seen": 234302736, "step": 230100 }, { "epoch": 4.5322990293555945, "grad_norm": 2.070924997329712, "learning_rate": 7.911612440184057e-07, "loss": 1.9786, "num_input_tokens_seen": 234403176, "step": 230200 }, { "epoch": 4.534267882105097, "grad_norm": 2.2279651165008545, "learning_rate": 7.909936032185161e-07, "loss": 2.056, "num_input_tokens_seen": 234505000, "step": 230300 }, { "epoch": 4.5362367348546, "grad_norm": 2.046231269836426, "learning_rate": 7.908259129376185e-07, "loss": 1.985, "num_input_tokens_seen": 234605928, "step": 230400 }, { "epoch": 4.538205587604103, "grad_norm": 1.8107647895812988, "learning_rate": 7.906581732042275e-07, "loss": 1.9942, "num_input_tokens_seen": 234707848, "step": 230500 }, { "epoch": 4.540174440353606, "grad_norm": 1.6536964178085327, "learning_rate": 7.904903840468655e-07, "loss": 1.9839, "num_input_tokens_seen": 234809720, "step": 230600 }, { "epoch": 4.542143293103109, "grad_norm": 1.911716103553772, "learning_rate": 7.903225454940639e-07, "loss": 1.955, "num_input_tokens_seen": 234911384, "step": 230700 }, { "epoch": 4.544112145852612, "grad_norm": 1.9053138494491577, "learning_rate": 7.901546575743621e-07, "loss": 2.0247, "num_input_tokens_seen": 235013416, "step": 230800 }, { "epoch": 4.546080998602115, "grad_norm": 1.8903558254241943, "learning_rate": 7.89986720316308e-07, "loss": 1.9646, "num_input_tokens_seen": 235115008, "step": 230900 }, { "epoch": 4.548049851351617, "grad_norm": 1.9929543733596802, "learning_rate": 7.89818733748458e-07, "loss": 1.961, "num_input_tokens_seen": 235217408, "step": 231000 }, { "epoch": 4.55001870410112, "grad_norm": 1.9232136011123657, "learning_rate": 7.896506978993767e-07, "loss": 2.0015, "num_input_tokens_seen": 235319256, "step": 231100 }, { "epoch": 4.551987556850623, "grad_norm": 1.882171392440796, "learning_rate": 7.894826127976374e-07, "loss": 2.015, "num_input_tokens_seen": 235420928, "step": 231200 }, { "epoch": 4.553956409600126, "grad_norm": 2.100560426712036, "learning_rate": 7.893144784718213e-07, "loss": 1.997, "num_input_tokens_seen": 235522856, "step": 231300 }, { "epoch": 4.5559252623496285, "grad_norm": 1.7758921384811401, "learning_rate": 7.891462949505185e-07, "loss": 1.9465, "num_input_tokens_seen": 235625256, "step": 231400 }, { "epoch": 4.557894115099132, "grad_norm": 1.914080023765564, "learning_rate": 7.889780622623268e-07, "loss": 2.0049, "num_input_tokens_seen": 235725664, "step": 231500 }, { "epoch": 4.559862967848635, "grad_norm": 1.8459874391555786, "learning_rate": 7.888097804358531e-07, "loss": 2.0241, "num_input_tokens_seen": 235826840, "step": 231600 }, { "epoch": 4.561831820598138, "grad_norm": 1.9404900074005127, "learning_rate": 7.88641449499712e-07, "loss": 1.9947, "num_input_tokens_seen": 235927976, "step": 231700 }, { "epoch": 4.56380067334764, "grad_norm": 2.0413923263549805, "learning_rate": 7.884730694825271e-07, "loss": 2.003, "num_input_tokens_seen": 236029696, "step": 231800 }, { "epoch": 4.565769526097143, "grad_norm": 1.9381991624832153, "learning_rate": 7.883046404129299e-07, "loss": 1.9189, "num_input_tokens_seen": 236132096, "step": 231900 }, { "epoch": 4.567738378846646, "grad_norm": 1.8498059511184692, "learning_rate": 7.8813616231956e-07, "loss": 1.9752, "num_input_tokens_seen": 236234496, "step": 232000 }, { "epoch": 4.569707231596149, "grad_norm": 1.8928567171096802, "learning_rate": 7.879676352310662e-07, "loss": 1.9923, "num_input_tokens_seen": 236336752, "step": 232100 }, { "epoch": 4.571676084345651, "grad_norm": 2.515831470489502, "learning_rate": 7.877990591761046e-07, "loss": 1.9548, "num_input_tokens_seen": 236439152, "step": 232200 }, { "epoch": 4.573644937095155, "grad_norm": 1.9485191106796265, "learning_rate": 7.876304341833407e-07, "loss": 1.9822, "num_input_tokens_seen": 236540784, "step": 232300 }, { "epoch": 4.575613789844658, "grad_norm": 1.871766448020935, "learning_rate": 7.874617602814472e-07, "loss": 1.9805, "num_input_tokens_seen": 236642752, "step": 232400 }, { "epoch": 4.577582642594161, "grad_norm": 1.8358638286590576, "learning_rate": 7.87293037499106e-07, "loss": 2.002, "num_input_tokens_seen": 236745152, "step": 232500 }, { "epoch": 4.579551495343663, "grad_norm": 2.0074095726013184, "learning_rate": 7.87124265865007e-07, "loss": 1.9208, "num_input_tokens_seen": 236846792, "step": 232600 }, { "epoch": 4.581520348093166, "grad_norm": 1.966914176940918, "learning_rate": 7.869554454078482e-07, "loss": 1.9783, "num_input_tokens_seen": 236949192, "step": 232700 }, { "epoch": 4.583489200842669, "grad_norm": 1.9215971231460571, "learning_rate": 7.867865761563364e-07, "loss": 1.9966, "num_input_tokens_seen": 237051000, "step": 232800 }, { "epoch": 4.585458053592172, "grad_norm": 2.2058589458465576, "learning_rate": 7.866176581391861e-07, "loss": 1.9756, "num_input_tokens_seen": 237152552, "step": 232900 }, { "epoch": 4.587426906341674, "grad_norm": 1.9091542959213257, "learning_rate": 7.864486913851203e-07, "loss": 2.0251, "num_input_tokens_seen": 237253944, "step": 233000 }, { "epoch": 4.589395759091177, "grad_norm": 2.0750956535339355, "learning_rate": 7.862796759228707e-07, "loss": 1.9849, "num_input_tokens_seen": 237355384, "step": 233100 }, { "epoch": 4.591364611840681, "grad_norm": 1.8790674209594727, "learning_rate": 7.861106117811767e-07, "loss": 1.9724, "num_input_tokens_seen": 237456600, "step": 233200 }, { "epoch": 4.5933334645901835, "grad_norm": 1.7590235471725464, "learning_rate": 7.859414989887865e-07, "loss": 1.9423, "num_input_tokens_seen": 237558360, "step": 233300 }, { "epoch": 4.595302317339686, "grad_norm": 2.190117835998535, "learning_rate": 7.85772337574456e-07, "loss": 2.0129, "num_input_tokens_seen": 237660760, "step": 233400 }, { "epoch": 4.597271170089189, "grad_norm": 1.9098066091537476, "learning_rate": 7.856031275669499e-07, "loss": 1.9746, "num_input_tokens_seen": 237761864, "step": 233500 }, { "epoch": 4.599240022838692, "grad_norm": 2.2540605068206787, "learning_rate": 7.854338689950408e-07, "loss": 1.9703, "num_input_tokens_seen": 237862968, "step": 233600 }, { "epoch": 4.601208875588195, "grad_norm": 2.0733883380889893, "learning_rate": 7.852645618875098e-07, "loss": 1.9833, "num_input_tokens_seen": 237964040, "step": 233700 }, { "epoch": 4.603177728337697, "grad_norm": 1.6968687772750854, "learning_rate": 7.85095206273146e-07, "loss": 2.0093, "num_input_tokens_seen": 238064352, "step": 233800 }, { "epoch": 4.605146581087201, "grad_norm": 1.6747245788574219, "learning_rate": 7.84925802180747e-07, "loss": 1.9902, "num_input_tokens_seen": 238165888, "step": 233900 }, { "epoch": 4.607115433836704, "grad_norm": 2.083350658416748, "learning_rate": 7.847563496391185e-07, "loss": 1.9852, "num_input_tokens_seen": 238268288, "step": 234000 }, { "epoch": 4.6090842865862065, "grad_norm": 1.9042161703109741, "learning_rate": 7.845868486770746e-07, "loss": 2.0179, "num_input_tokens_seen": 238370688, "step": 234100 }, { "epoch": 4.611053139335709, "grad_norm": 1.9082952737808228, "learning_rate": 7.844172993234372e-07, "loss": 1.9391, "num_input_tokens_seen": 238472456, "step": 234200 }, { "epoch": 4.613021992085212, "grad_norm": 1.8290693759918213, "learning_rate": 7.84247701607037e-07, "loss": 1.9625, "num_input_tokens_seen": 238574856, "step": 234300 }, { "epoch": 4.614990844834715, "grad_norm": 1.919039011001587, "learning_rate": 7.840780555567128e-07, "loss": 1.9387, "num_input_tokens_seen": 238676608, "step": 234400 }, { "epoch": 4.6169596975842175, "grad_norm": 2.044429302215576, "learning_rate": 7.839083612013111e-07, "loss": 1.9293, "num_input_tokens_seen": 238779008, "step": 234500 }, { "epoch": 4.61892855033372, "grad_norm": 1.6604371070861816, "learning_rate": 7.837386185696872e-07, "loss": 1.9873, "num_input_tokens_seen": 238880936, "step": 234600 }, { "epoch": 4.620897403083223, "grad_norm": 1.8480556011199951, "learning_rate": 7.835688276907044e-07, "loss": 1.9705, "num_input_tokens_seen": 238983336, "step": 234700 }, { "epoch": 4.622866255832726, "grad_norm": 1.7892968654632568, "learning_rate": 7.833989885932342e-07, "loss": 2.0523, "num_input_tokens_seen": 239084888, "step": 234800 }, { "epoch": 4.6248351085822295, "grad_norm": 1.9779351949691772, "learning_rate": 7.832291013061561e-07, "loss": 1.9507, "num_input_tokens_seen": 239186672, "step": 234900 }, { "epoch": 4.626803961331732, "grad_norm": 2.0236270427703857, "learning_rate": 7.830591658583584e-07, "loss": 1.9871, "num_input_tokens_seen": 239288384, "step": 235000 }, { "epoch": 4.628772814081235, "grad_norm": 1.9210270643234253, "learning_rate": 7.828891822787369e-07, "loss": 2.0223, "num_input_tokens_seen": 239389464, "step": 235100 }, { "epoch": 4.630741666830738, "grad_norm": 1.7915340662002563, "learning_rate": 7.827191505961959e-07, "loss": 1.9643, "num_input_tokens_seen": 239491864, "step": 235200 }, { "epoch": 4.6327105195802405, "grad_norm": 1.9686694145202637, "learning_rate": 7.82549070839648e-07, "loss": 2.0114, "num_input_tokens_seen": 239594264, "step": 235300 }, { "epoch": 4.634679372329743, "grad_norm": 2.0891788005828857, "learning_rate": 7.823789430380136e-07, "loss": 1.9559, "num_input_tokens_seen": 239696664, "step": 235400 }, { "epoch": 4.636648225079246, "grad_norm": 1.9873641729354858, "learning_rate": 7.822087672202217e-07, "loss": 1.9618, "num_input_tokens_seen": 239798208, "step": 235500 }, { "epoch": 4.63861707782875, "grad_norm": 1.9363912343978882, "learning_rate": 7.820385434152093e-07, "loss": 1.9937, "num_input_tokens_seen": 239899792, "step": 235600 }, { "epoch": 4.640585930578252, "grad_norm": 2.0016725063323975, "learning_rate": 7.818682716519214e-07, "loss": 2.0103, "num_input_tokens_seen": 240000832, "step": 235700 }, { "epoch": 4.642554783327755, "grad_norm": 1.9424642324447632, "learning_rate": 7.816979519593112e-07, "loss": 1.9981, "num_input_tokens_seen": 240103232, "step": 235800 }, { "epoch": 4.644523636077258, "grad_norm": 1.9264036417007446, "learning_rate": 7.815275843663402e-07, "loss": 2.0369, "num_input_tokens_seen": 240204352, "step": 235900 }, { "epoch": 4.646492488826761, "grad_norm": 1.7386269569396973, "learning_rate": 7.813571689019781e-07, "loss": 2.0238, "num_input_tokens_seen": 240306112, "step": 236000 }, { "epoch": 4.6484613415762635, "grad_norm": 1.8147231340408325, "learning_rate": 7.811867055952024e-07, "loss": 2.0585, "num_input_tokens_seen": 240407656, "step": 236100 }, { "epoch": 4.650430194325766, "grad_norm": 1.8757692575454712, "learning_rate": 7.810161944749991e-07, "loss": 1.9717, "num_input_tokens_seen": 240510056, "step": 236200 }, { "epoch": 4.652399047075269, "grad_norm": 1.6022590398788452, "learning_rate": 7.808456355703621e-07, "loss": 2.0203, "num_input_tokens_seen": 240610992, "step": 236300 }, { "epoch": 4.654367899824772, "grad_norm": 4.077651023864746, "learning_rate": 7.806750289102935e-07, "loss": 1.9609, "num_input_tokens_seen": 240712640, "step": 236400 }, { "epoch": 4.6563367525742745, "grad_norm": 2.067807197570801, "learning_rate": 7.805043745238037e-07, "loss": 1.9732, "num_input_tokens_seen": 240815040, "step": 236500 }, { "epoch": 4.658305605323778, "grad_norm": 2.07737135887146, "learning_rate": 7.803336724399107e-07, "loss": 2.0, "num_input_tokens_seen": 240916976, "step": 236600 }, { "epoch": 4.660274458073281, "grad_norm": 2.0691514015197754, "learning_rate": 7.801629226876414e-07, "loss": 1.9896, "num_input_tokens_seen": 241018552, "step": 236700 }, { "epoch": 4.662243310822784, "grad_norm": 2.2837865352630615, "learning_rate": 7.799921252960299e-07, "loss": 1.9879, "num_input_tokens_seen": 241120952, "step": 236800 }, { "epoch": 4.664212163572286, "grad_norm": 1.9081482887268066, "learning_rate": 7.798212802941191e-07, "loss": 2.0108, "num_input_tokens_seen": 241223280, "step": 236900 }, { "epoch": 4.666181016321789, "grad_norm": 2.322063446044922, "learning_rate": 7.796503877109597e-07, "loss": 1.9737, "num_input_tokens_seen": 241325680, "step": 237000 }, { "epoch": 4.668149869071292, "grad_norm": 1.907496452331543, "learning_rate": 7.794794475756106e-07, "loss": 1.9647, "num_input_tokens_seen": 241427512, "step": 237100 }, { "epoch": 4.670118721820795, "grad_norm": 1.9791024923324585, "learning_rate": 7.793084599171387e-07, "loss": 1.9795, "num_input_tokens_seen": 241529912, "step": 237200 }, { "epoch": 4.672087574570298, "grad_norm": 1.9470134973526, "learning_rate": 7.791374247646188e-07, "loss": 1.933, "num_input_tokens_seen": 241631400, "step": 237300 }, { "epoch": 4.674056427319801, "grad_norm": 1.9613935947418213, "learning_rate": 7.789663421471343e-07, "loss": 1.9493, "num_input_tokens_seen": 241733800, "step": 237400 }, { "epoch": 4.676025280069304, "grad_norm": 2.3004748821258545, "learning_rate": 7.787952120937761e-07, "loss": 1.9725, "num_input_tokens_seen": 241835624, "step": 237500 }, { "epoch": 4.677994132818807, "grad_norm": 1.9286905527114868, "learning_rate": 7.78624034633644e-07, "loss": 2.0052, "num_input_tokens_seen": 241937448, "step": 237600 }, { "epoch": 4.679962985568309, "grad_norm": 1.8778142929077148, "learning_rate": 7.784528097958444e-07, "loss": 1.9878, "num_input_tokens_seen": 242038424, "step": 237700 }, { "epoch": 4.681931838317812, "grad_norm": 1.7701457738876343, "learning_rate": 7.782815376094931e-07, "loss": 1.9773, "num_input_tokens_seen": 242139376, "step": 237800 }, { "epoch": 4.683900691067315, "grad_norm": 1.7408747673034668, "learning_rate": 7.781102181037136e-07, "loss": 2.0059, "num_input_tokens_seen": 242241136, "step": 237900 }, { "epoch": 4.685869543816818, "grad_norm": 1.8353618383407593, "learning_rate": 7.779388513076373e-07, "loss": 1.9885, "num_input_tokens_seen": 242342904, "step": 238000 }, { "epoch": 4.68783839656632, "grad_norm": 1.8905603885650635, "learning_rate": 7.777674372504033e-07, "loss": 1.9919, "num_input_tokens_seen": 242443632, "step": 238100 }, { "epoch": 4.689807249315824, "grad_norm": 1.9271143674850464, "learning_rate": 7.775959759611595e-07, "loss": 1.9468, "num_input_tokens_seen": 242544712, "step": 238200 }, { "epoch": 4.691776102065327, "grad_norm": 2.4840543270111084, "learning_rate": 7.774244674690613e-07, "loss": 1.9898, "num_input_tokens_seen": 242646256, "step": 238300 }, { "epoch": 4.69374495481483, "grad_norm": 1.6891206502914429, "learning_rate": 7.772529118032725e-07, "loss": 1.9578, "num_input_tokens_seen": 242748128, "step": 238400 }, { "epoch": 4.695713807564332, "grad_norm": 1.9536232948303223, "learning_rate": 7.770813089929643e-07, "loss": 1.9804, "num_input_tokens_seen": 242850528, "step": 238500 }, { "epoch": 4.697682660313835, "grad_norm": 2.0244576930999756, "learning_rate": 7.769096590673167e-07, "loss": 1.9641, "num_input_tokens_seen": 242951256, "step": 238600 }, { "epoch": 4.699651513063338, "grad_norm": 1.9010989665985107, "learning_rate": 7.76737962055517e-07, "loss": 1.9761, "num_input_tokens_seen": 243053656, "step": 238700 }, { "epoch": 4.701620365812841, "grad_norm": 2.0288681983947754, "learning_rate": 7.76566217986761e-07, "loss": 1.9956, "num_input_tokens_seen": 243156056, "step": 238800 }, { "epoch": 4.703589218562343, "grad_norm": 2.0047388076782227, "learning_rate": 7.763944268902524e-07, "loss": 1.955, "num_input_tokens_seen": 243258456, "step": 238900 }, { "epoch": 4.705558071311847, "grad_norm": 7.127472877502441, "learning_rate": 7.762225887952025e-07, "loss": 2.0129, "num_input_tokens_seen": 243360856, "step": 239000 }, { "epoch": 4.70752692406135, "grad_norm": 2.2163915634155273, "learning_rate": 7.760507037308313e-07, "loss": 2.0004, "num_input_tokens_seen": 243463256, "step": 239100 }, { "epoch": 4.7094957768108525, "grad_norm": 1.742059350013733, "learning_rate": 7.758787717263662e-07, "loss": 2.0329, "num_input_tokens_seen": 243564056, "step": 239200 }, { "epoch": 4.711464629560355, "grad_norm": 1.989970088005066, "learning_rate": 7.757067928110431e-07, "loss": 1.9915, "num_input_tokens_seen": 243665560, "step": 239300 }, { "epoch": 4.713433482309858, "grad_norm": 1.9462517499923706, "learning_rate": 7.755347670141049e-07, "loss": 1.9581, "num_input_tokens_seen": 243767960, "step": 239400 }, { "epoch": 4.715402335059361, "grad_norm": 1.9584484100341797, "learning_rate": 7.753626943648038e-07, "loss": 1.9533, "num_input_tokens_seen": 243869880, "step": 239500 }, { "epoch": 4.7173711878088636, "grad_norm": 1.6583120822906494, "learning_rate": 7.751905748923989e-07, "loss": 2.0057, "num_input_tokens_seen": 243971688, "step": 239600 }, { "epoch": 4.719340040558366, "grad_norm": 2.032684087753296, "learning_rate": 7.750184086261578e-07, "loss": 1.99, "num_input_tokens_seen": 244074088, "step": 239700 }, { "epoch": 4.721308893307869, "grad_norm": 1.8053139448165894, "learning_rate": 7.74846195595356e-07, "loss": 1.979, "num_input_tokens_seen": 244176488, "step": 239800 }, { "epoch": 4.723277746057373, "grad_norm": 2.0863256454467773, "learning_rate": 7.746739358292769e-07, "loss": 1.9828, "num_input_tokens_seen": 244277368, "step": 239900 }, { "epoch": 4.7252465988068755, "grad_norm": 1.8091161251068115, "learning_rate": 7.745016293572115e-07, "loss": 1.965, "num_input_tokens_seen": 244379768, "step": 240000 }, { "epoch": 4.727215451556378, "grad_norm": 1.9357178211212158, "learning_rate": 7.743292762084594e-07, "loss": 1.9679, "num_input_tokens_seen": 244482168, "step": 240100 }, { "epoch": 4.729184304305881, "grad_norm": 5.501689434051514, "learning_rate": 7.741568764123276e-07, "loss": 2.0063, "num_input_tokens_seen": 244583776, "step": 240200 }, { "epoch": 4.731153157055384, "grad_norm": 1.9443832635879517, "learning_rate": 7.739844299981315e-07, "loss": 1.9971, "num_input_tokens_seen": 244684488, "step": 240300 }, { "epoch": 4.7331220098048865, "grad_norm": 1.8885704278945923, "learning_rate": 7.738119369951936e-07, "loss": 1.9554, "num_input_tokens_seen": 244784608, "step": 240400 }, { "epoch": 4.735090862554389, "grad_norm": 2.113469362258911, "learning_rate": 7.736393974328456e-07, "loss": 1.9819, "num_input_tokens_seen": 244887008, "step": 240500 }, { "epoch": 4.737059715303893, "grad_norm": 1.9098161458969116, "learning_rate": 7.734668113404259e-07, "loss": 2.0177, "num_input_tokens_seen": 244988952, "step": 240600 }, { "epoch": 4.739028568053396, "grad_norm": 1.8703964948654175, "learning_rate": 7.732941787472815e-07, "loss": 1.983, "num_input_tokens_seen": 245090336, "step": 240700 }, { "epoch": 4.740997420802898, "grad_norm": 1.9677131175994873, "learning_rate": 7.731214996827671e-07, "loss": 1.993, "num_input_tokens_seen": 245191096, "step": 240800 }, { "epoch": 4.742966273552401, "grad_norm": 1.7599327564239502, "learning_rate": 7.729487741762453e-07, "loss": 2.0268, "num_input_tokens_seen": 245293496, "step": 240900 }, { "epoch": 4.744935126301904, "grad_norm": 1.8329025506973267, "learning_rate": 7.727760022570866e-07, "loss": 1.9771, "num_input_tokens_seen": 245394472, "step": 241000 }, { "epoch": 4.746903979051407, "grad_norm": 1.8511635065078735, "learning_rate": 7.726031839546694e-07, "loss": 1.9415, "num_input_tokens_seen": 245496872, "step": 241100 }, { "epoch": 4.7488728318009095, "grad_norm": 1.6652101278305054, "learning_rate": 7.724303192983798e-07, "loss": 1.9613, "num_input_tokens_seen": 245598696, "step": 241200 }, { "epoch": 4.750841684550412, "grad_norm": 2.142704725265503, "learning_rate": 7.722574083176126e-07, "loss": 1.9493, "num_input_tokens_seen": 245700520, "step": 241300 }, { "epoch": 4.752810537299915, "grad_norm": 1.9635742902755737, "learning_rate": 7.72084451041769e-07, "loss": 1.9343, "num_input_tokens_seen": 245802376, "step": 241400 }, { "epoch": 4.754779390049418, "grad_norm": 1.9217370748519897, "learning_rate": 7.719114475002598e-07, "loss": 1.9815, "num_input_tokens_seen": 245903904, "step": 241500 }, { "epoch": 4.756748242798921, "grad_norm": 1.9156200885772705, "learning_rate": 7.717383977225021e-07, "loss": 1.959, "num_input_tokens_seen": 246006304, "step": 241600 }, { "epoch": 4.758717095548424, "grad_norm": 2.073235273361206, "learning_rate": 7.715653017379217e-07, "loss": 1.9583, "num_input_tokens_seen": 246108040, "step": 241700 }, { "epoch": 4.760685948297927, "grad_norm": 1.8034470081329346, "learning_rate": 7.713921595759525e-07, "loss": 2.0054, "num_input_tokens_seen": 246209776, "step": 241800 }, { "epoch": 4.76265480104743, "grad_norm": 1.9783844947814941, "learning_rate": 7.712189712660355e-07, "loss": 2.0034, "num_input_tokens_seen": 246311672, "step": 241900 }, { "epoch": 4.764623653796932, "grad_norm": 1.8855631351470947, "learning_rate": 7.7104573683762e-07, "loss": 1.9775, "num_input_tokens_seen": 246413528, "step": 242000 }, { "epoch": 4.766592506546435, "grad_norm": 1.9352155923843384, "learning_rate": 7.708724563201629e-07, "loss": 2.0206, "num_input_tokens_seen": 246515928, "step": 242100 }, { "epoch": 4.768561359295938, "grad_norm": 1.8944944143295288, "learning_rate": 7.706991297431294e-07, "loss": 2.0112, "num_input_tokens_seen": 246616336, "step": 242200 }, { "epoch": 4.770530212045442, "grad_norm": 1.7942332029342651, "learning_rate": 7.705257571359921e-07, "loss": 1.9552, "num_input_tokens_seen": 246718736, "step": 242300 }, { "epoch": 4.772499064794944, "grad_norm": 2.022024631500244, "learning_rate": 7.703523385282314e-07, "loss": 1.9995, "num_input_tokens_seen": 246820328, "step": 242400 }, { "epoch": 4.774467917544447, "grad_norm": 1.8557579517364502, "learning_rate": 7.70178873949336e-07, "loss": 1.9152, "num_input_tokens_seen": 246921128, "step": 242500 }, { "epoch": 4.77643677029395, "grad_norm": 1.7127655744552612, "learning_rate": 7.700053634288018e-07, "loss": 2.0022, "num_input_tokens_seen": 247022976, "step": 242600 }, { "epoch": 4.778405623043453, "grad_norm": 2.2102367877960205, "learning_rate": 7.698318069961327e-07, "loss": 1.9786, "num_input_tokens_seen": 247125376, "step": 242700 }, { "epoch": 4.780374475792955, "grad_norm": 1.8471670150756836, "learning_rate": 7.696582046808409e-07, "loss": 2.0114, "num_input_tokens_seen": 247227224, "step": 242800 }, { "epoch": 4.782343328542458, "grad_norm": 2.032442808151245, "learning_rate": 7.694845565124456e-07, "loss": 1.9674, "num_input_tokens_seen": 247328480, "step": 242900 }, { "epoch": 4.784312181291961, "grad_norm": 1.9215418100357056, "learning_rate": 7.693108625204745e-07, "loss": 1.9807, "num_input_tokens_seen": 247430120, "step": 243000 }, { "epoch": 4.786281034041464, "grad_norm": 1.954582929611206, "learning_rate": 7.691371227344628e-07, "loss": 2.0155, "num_input_tokens_seen": 247530976, "step": 243100 }, { "epoch": 4.788249886790966, "grad_norm": 1.8124114274978638, "learning_rate": 7.689633371839533e-07, "loss": 1.9671, "num_input_tokens_seen": 247631784, "step": 243200 }, { "epoch": 4.79021873954047, "grad_norm": 1.6148899793624878, "learning_rate": 7.687895058984968e-07, "loss": 1.9918, "num_input_tokens_seen": 247732560, "step": 243300 }, { "epoch": 4.792187592289973, "grad_norm": 1.8852741718292236, "learning_rate": 7.686156289076519e-07, "loss": 2.0244, "num_input_tokens_seen": 247834960, "step": 243400 }, { "epoch": 4.794156445039476, "grad_norm": 2.124105930328369, "learning_rate": 7.684417062409849e-07, "loss": 1.9407, "num_input_tokens_seen": 247936560, "step": 243500 }, { "epoch": 4.796125297788978, "grad_norm": 2.2060279846191406, "learning_rate": 7.682677379280699e-07, "loss": 1.9684, "num_input_tokens_seen": 248038960, "step": 243600 }, { "epoch": 4.798094150538481, "grad_norm": 1.9046642780303955, "learning_rate": 7.680937239984888e-07, "loss": 1.9862, "num_input_tokens_seen": 248139920, "step": 243700 }, { "epoch": 4.800063003287984, "grad_norm": 1.8380968570709229, "learning_rate": 7.679196644818313e-07, "loss": 2.0188, "num_input_tokens_seen": 248242320, "step": 243800 }, { "epoch": 4.802031856037487, "grad_norm": 1.8512117862701416, "learning_rate": 7.677455594076943e-07, "loss": 1.9799, "num_input_tokens_seen": 248344720, "step": 243900 }, { "epoch": 4.80400070878699, "grad_norm": 1.8946309089660645, "learning_rate": 7.675714088056833e-07, "loss": 1.9191, "num_input_tokens_seen": 248446640, "step": 244000 }, { "epoch": 4.805969561536493, "grad_norm": 1.989884376525879, "learning_rate": 7.673972127054112e-07, "loss": 1.976, "num_input_tokens_seen": 248546576, "step": 244100 }, { "epoch": 4.807938414285996, "grad_norm": 2.0823934078216553, "learning_rate": 7.672229711364981e-07, "loss": 2.0169, "num_input_tokens_seen": 248648840, "step": 244200 }, { "epoch": 4.8099072670354985, "grad_norm": 2.1578969955444336, "learning_rate": 7.670486841285728e-07, "loss": 2.0082, "num_input_tokens_seen": 248751240, "step": 244300 }, { "epoch": 4.811876119785001, "grad_norm": 1.9351460933685303, "learning_rate": 7.668743517112713e-07, "loss": 2.003, "num_input_tokens_seen": 248852264, "step": 244400 }, { "epoch": 4.813844972534504, "grad_norm": 3.4517440795898438, "learning_rate": 7.66699973914237e-07, "loss": 1.9917, "num_input_tokens_seen": 248954360, "step": 244500 }, { "epoch": 4.815813825284007, "grad_norm": 1.9812885522842407, "learning_rate": 7.665255507671216e-07, "loss": 1.9517, "num_input_tokens_seen": 249056760, "step": 244600 }, { "epoch": 4.81778267803351, "grad_norm": 1.8989157676696777, "learning_rate": 7.663510822995842e-07, "loss": 1.9795, "num_input_tokens_seen": 249159160, "step": 244700 }, { "epoch": 4.819751530783012, "grad_norm": 2.909083843231201, "learning_rate": 7.661765685412919e-07, "loss": 2.0136, "num_input_tokens_seen": 249260792, "step": 244800 }, { "epoch": 4.821720383532516, "grad_norm": 2.0052597522735596, "learning_rate": 7.660020095219192e-07, "loss": 1.9627, "num_input_tokens_seen": 249362848, "step": 244900 }, { "epoch": 4.823689236282019, "grad_norm": 1.970702052116394, "learning_rate": 7.658274052711483e-07, "loss": 1.9917, "num_input_tokens_seen": 249464624, "step": 245000 }, { "epoch": 4.8256580890315215, "grad_norm": 1.9335062503814697, "learning_rate": 7.656527558186692e-07, "loss": 2.0013, "num_input_tokens_seen": 249566456, "step": 245100 }, { "epoch": 4.827626941781024, "grad_norm": 2.063624620437622, "learning_rate": 7.654780611941796e-07, "loss": 2.0469, "num_input_tokens_seen": 249667304, "step": 245200 }, { "epoch": 4.829595794530527, "grad_norm": 1.7945432662963867, "learning_rate": 7.653033214273849e-07, "loss": 1.9682, "num_input_tokens_seen": 249769704, "step": 245300 }, { "epoch": 4.83156464728003, "grad_norm": 3.2311344146728516, "learning_rate": 7.651285365479979e-07, "loss": 1.945, "num_input_tokens_seen": 249872104, "step": 245400 }, { "epoch": 4.8335335000295325, "grad_norm": 1.8495867252349854, "learning_rate": 7.649537065857397e-07, "loss": 1.9813, "num_input_tokens_seen": 249974504, "step": 245500 }, { "epoch": 4.835502352779035, "grad_norm": 1.802419900894165, "learning_rate": 7.647788315703381e-07, "loss": 1.9672, "num_input_tokens_seen": 250076056, "step": 245600 }, { "epoch": 4.837471205528539, "grad_norm": 1.8500380516052246, "learning_rate": 7.646039115315297e-07, "loss": 2.0243, "num_input_tokens_seen": 250178000, "step": 245700 }, { "epoch": 4.839440058278042, "grad_norm": 1.6966948509216309, "learning_rate": 7.644289464990578e-07, "loss": 1.9632, "num_input_tokens_seen": 250279832, "step": 245800 }, { "epoch": 4.8414089110275444, "grad_norm": 1.8331348896026611, "learning_rate": 7.642539365026739e-07, "loss": 1.9973, "num_input_tokens_seen": 250382232, "step": 245900 }, { "epoch": 4.843377763777047, "grad_norm": 1.5486363172531128, "learning_rate": 7.640788815721368e-07, "loss": 1.9749, "num_input_tokens_seen": 250484632, "step": 246000 }, { "epoch": 4.84534661652655, "grad_norm": 2.655693531036377, "learning_rate": 7.639037817372135e-07, "loss": 2.0112, "num_input_tokens_seen": 250586272, "step": 246100 }, { "epoch": 4.847315469276053, "grad_norm": 2.0464234352111816, "learning_rate": 7.637286370276778e-07, "loss": 1.9555, "num_input_tokens_seen": 250688672, "step": 246200 }, { "epoch": 4.8492843220255555, "grad_norm": 1.990045428276062, "learning_rate": 7.635534474733118e-07, "loss": 2.0138, "num_input_tokens_seen": 250790488, "step": 246300 }, { "epoch": 4.851253174775058, "grad_norm": 1.757544755935669, "learning_rate": 7.633782131039048e-07, "loss": 1.9635, "num_input_tokens_seen": 250892368, "step": 246400 }, { "epoch": 4.853222027524561, "grad_norm": 2.010892152786255, "learning_rate": 7.632029339492543e-07, "loss": 1.9928, "num_input_tokens_seen": 250993536, "step": 246500 }, { "epoch": 4.855190880274065, "grad_norm": 2.1679794788360596, "learning_rate": 7.630276100391647e-07, "loss": 2.0663, "num_input_tokens_seen": 251093456, "step": 246600 }, { "epoch": 4.857159733023567, "grad_norm": 2.170764923095703, "learning_rate": 7.628522414034486e-07, "loss": 1.9865, "num_input_tokens_seen": 251195032, "step": 246700 }, { "epoch": 4.85912858577307, "grad_norm": 1.8072079420089722, "learning_rate": 7.626768280719258e-07, "loss": 1.9895, "num_input_tokens_seen": 251297432, "step": 246800 }, { "epoch": 4.861097438522573, "grad_norm": 2.074186325073242, "learning_rate": 7.62501370074424e-07, "loss": 2.0129, "num_input_tokens_seen": 251398832, "step": 246900 }, { "epoch": 4.863066291272076, "grad_norm": 1.8249096870422363, "learning_rate": 7.623258674407782e-07, "loss": 1.9659, "num_input_tokens_seen": 251501032, "step": 247000 }, { "epoch": 4.865035144021578, "grad_norm": 4.696745872497559, "learning_rate": 7.621503202008312e-07, "loss": 2.0045, "num_input_tokens_seen": 251602816, "step": 247100 }, { "epoch": 4.867003996771081, "grad_norm": 2.030505418777466, "learning_rate": 7.619747283844332e-07, "loss": 1.9664, "num_input_tokens_seen": 251705216, "step": 247200 }, { "epoch": 4.868972849520585, "grad_norm": 2.0291748046875, "learning_rate": 7.617990920214425e-07, "loss": 1.9633, "num_input_tokens_seen": 251807616, "step": 247300 }, { "epoch": 4.870941702270088, "grad_norm": 2.032618999481201, "learning_rate": 7.61623411141724e-07, "loss": 1.9673, "num_input_tokens_seen": 251910016, "step": 247400 }, { "epoch": 4.87291055501959, "grad_norm": 1.7918150424957275, "learning_rate": 7.614476857751513e-07, "loss": 1.9885, "num_input_tokens_seen": 252012416, "step": 247500 }, { "epoch": 4.874879407769093, "grad_norm": 1.6047450304031372, "learning_rate": 7.612719159516046e-07, "loss": 1.9819, "num_input_tokens_seen": 252114816, "step": 247600 }, { "epoch": 4.876848260518596, "grad_norm": 2.1791837215423584, "learning_rate": 7.610961017009725e-07, "loss": 2.0085, "num_input_tokens_seen": 252216488, "step": 247700 }, { "epoch": 4.878817113268099, "grad_norm": 2.147240161895752, "learning_rate": 7.609202430531503e-07, "loss": 1.997, "num_input_tokens_seen": 252318344, "step": 247800 }, { "epoch": 4.880785966017601, "grad_norm": 3.31123685836792, "learning_rate": 7.607443400380415e-07, "loss": 2.068, "num_input_tokens_seen": 252420184, "step": 247900 }, { "epoch": 4.882754818767104, "grad_norm": 2.0299837589263916, "learning_rate": 7.60568392685557e-07, "loss": 2.034, "num_input_tokens_seen": 252521064, "step": 248000 }, { "epoch": 4.884723671516607, "grad_norm": 1.8934005498886108, "learning_rate": 7.603924010256149e-07, "loss": 1.9944, "num_input_tokens_seen": 252622640, "step": 248100 }, { "epoch": 4.88669252426611, "grad_norm": 1.7607197761535645, "learning_rate": 7.602163650881415e-07, "loss": 1.9715, "num_input_tokens_seen": 252723392, "step": 248200 }, { "epoch": 4.888661377015613, "grad_norm": 1.6598211526870728, "learning_rate": 7.600402849030698e-07, "loss": 2.0128, "num_input_tokens_seen": 252825080, "step": 248300 }, { "epoch": 4.890630229765116, "grad_norm": 1.8595517873764038, "learning_rate": 7.598641605003408e-07, "loss": 1.994, "num_input_tokens_seen": 252926560, "step": 248400 }, { "epoch": 4.892599082514619, "grad_norm": 1.9030126333236694, "learning_rate": 7.596879919099033e-07, "loss": 1.9687, "num_input_tokens_seen": 253028960, "step": 248500 }, { "epoch": 4.894567935264122, "grad_norm": 2.1952364444732666, "learning_rate": 7.595117791617131e-07, "loss": 2.0137, "num_input_tokens_seen": 253130320, "step": 248600 }, { "epoch": 4.896536788013624, "grad_norm": 1.8906571865081787, "learning_rate": 7.593355222857336e-07, "loss": 1.9851, "num_input_tokens_seen": 253232720, "step": 248700 }, { "epoch": 4.898505640763127, "grad_norm": 2.167135000228882, "learning_rate": 7.591592213119359e-07, "loss": 1.9977, "num_input_tokens_seen": 253334480, "step": 248800 }, { "epoch": 4.90047449351263, "grad_norm": 1.8287575244903564, "learning_rate": 7.589828762702986e-07, "loss": 1.9797, "num_input_tokens_seen": 253436360, "step": 248900 }, { "epoch": 4.9024433462621335, "grad_norm": 1.7591739892959595, "learning_rate": 7.588064871908073e-07, "loss": 2.0528, "num_input_tokens_seen": 253535936, "step": 249000 }, { "epoch": 4.904412199011636, "grad_norm": 1.7703303098678589, "learning_rate": 7.586300541034559e-07, "loss": 1.9703, "num_input_tokens_seen": 253637656, "step": 249100 }, { "epoch": 4.906381051761139, "grad_norm": 2.017404317855835, "learning_rate": 7.584535770382451e-07, "loss": 1.9691, "num_input_tokens_seen": 253740056, "step": 249200 }, { "epoch": 4.908349904510642, "grad_norm": 2.018389940261841, "learning_rate": 7.582770560251835e-07, "loss": 1.9676, "num_input_tokens_seen": 253842456, "step": 249300 }, { "epoch": 4.9103187572601446, "grad_norm": 1.9108238220214844, "learning_rate": 7.581004910942869e-07, "loss": 1.9926, "num_input_tokens_seen": 253944440, "step": 249400 }, { "epoch": 4.912287610009647, "grad_norm": 1.774223804473877, "learning_rate": 7.579238822755787e-07, "loss": 1.9619, "num_input_tokens_seen": 254046840, "step": 249500 }, { "epoch": 4.91425646275915, "grad_norm": 1.94884192943573, "learning_rate": 7.577472295990896e-07, "loss": 1.9519, "num_input_tokens_seen": 254148752, "step": 249600 }, { "epoch": 4.916225315508653, "grad_norm": 1.7490347623825073, "learning_rate": 7.575705330948583e-07, "loss": 2.0006, "num_input_tokens_seen": 254250520, "step": 249700 }, { "epoch": 4.918194168258156, "grad_norm": 1.9168856143951416, "learning_rate": 7.573937927929302e-07, "loss": 1.9962, "num_input_tokens_seen": 254352920, "step": 249800 }, { "epoch": 4.920163021007658, "grad_norm": 1.9100327491760254, "learning_rate": 7.572170087233586e-07, "loss": 2.0098, "num_input_tokens_seen": 254454720, "step": 249900 }, { "epoch": 4.922131873757162, "grad_norm": 2.0106396675109863, "learning_rate": 7.570401809162042e-07, "loss": 1.9462, "num_input_tokens_seen": 254557120, "step": 250000 }, { "epoch": 4.924100726506665, "grad_norm": 1.8019355535507202, "learning_rate": 7.568633094015349e-07, "loss": 2.0181, "num_input_tokens_seen": 254658400, "step": 250100 }, { "epoch": 4.9260695792561675, "grad_norm": 2.2052829265594482, "learning_rate": 7.566863942094263e-07, "loss": 1.9828, "num_input_tokens_seen": 254760800, "step": 250200 }, { "epoch": 4.92803843200567, "grad_norm": 2.1917173862457275, "learning_rate": 7.565094353699614e-07, "loss": 1.9979, "num_input_tokens_seen": 254862552, "step": 250300 }, { "epoch": 4.930007284755173, "grad_norm": 1.8937517404556274, "learning_rate": 7.563324329132306e-07, "loss": 1.9988, "num_input_tokens_seen": 254964096, "step": 250400 }, { "epoch": 4.931976137504676, "grad_norm": 1.8392560482025146, "learning_rate": 7.561553868693313e-07, "loss": 1.984, "num_input_tokens_seen": 255066496, "step": 250500 }, { "epoch": 4.9339449902541785, "grad_norm": 1.7257837057113647, "learning_rate": 7.559782972683694e-07, "loss": 1.9893, "num_input_tokens_seen": 255168896, "step": 250600 }, { "epoch": 4.935913843003682, "grad_norm": 1.9757217168807983, "learning_rate": 7.558011641404568e-07, "loss": 2.0202, "num_input_tokens_seen": 255271000, "step": 250700 }, { "epoch": 4.937882695753185, "grad_norm": 2.108100652694702, "learning_rate": 7.556239875157139e-07, "loss": 2.0018, "num_input_tokens_seen": 255371784, "step": 250800 }, { "epoch": 4.939851548502688, "grad_norm": 2.2486793994903564, "learning_rate": 7.554467674242678e-07, "loss": 1.9993, "num_input_tokens_seen": 255474184, "step": 250900 }, { "epoch": 4.9418204012521905, "grad_norm": 2.750699520111084, "learning_rate": 7.552695038962535e-07, "loss": 1.9614, "num_input_tokens_seen": 255576584, "step": 251000 }, { "epoch": 4.943789254001693, "grad_norm": 1.7878025770187378, "learning_rate": 7.550921969618132e-07, "loss": 2.0229, "num_input_tokens_seen": 255678280, "step": 251100 }, { "epoch": 4.945758106751196, "grad_norm": 2.116180181503296, "learning_rate": 7.549148466510964e-07, "loss": 1.9928, "num_input_tokens_seen": 255780680, "step": 251200 }, { "epoch": 4.947726959500699, "grad_norm": 1.764121413230896, "learning_rate": 7.547374529942597e-07, "loss": 1.9626, "num_input_tokens_seen": 255883080, "step": 251300 }, { "epoch": 4.9496958122502015, "grad_norm": 2.0100488662719727, "learning_rate": 7.545600160214679e-07, "loss": 1.9804, "num_input_tokens_seen": 255984064, "step": 251400 }, { "epoch": 4.951664664999704, "grad_norm": 1.9737484455108643, "learning_rate": 7.543825357628924e-07, "loss": 1.9367, "num_input_tokens_seen": 256086464, "step": 251500 }, { "epoch": 4.953633517749208, "grad_norm": 1.7976274490356445, "learning_rate": 7.542050122487122e-07, "loss": 1.9902, "num_input_tokens_seen": 256188104, "step": 251600 }, { "epoch": 4.955602370498711, "grad_norm": 1.7301064729690552, "learning_rate": 7.540274455091136e-07, "loss": 1.9746, "num_input_tokens_seen": 256289736, "step": 251700 }, { "epoch": 4.957571223248213, "grad_norm": 1.71588933467865, "learning_rate": 7.538498355742907e-07, "loss": 1.9361, "num_input_tokens_seen": 256392136, "step": 251800 }, { "epoch": 4.959540075997716, "grad_norm": 7.013174533843994, "learning_rate": 7.536721824744443e-07, "loss": 2.0139, "num_input_tokens_seen": 256494176, "step": 251900 }, { "epoch": 4.961508928747219, "grad_norm": 2.029755115509033, "learning_rate": 7.534944862397827e-07, "loss": 2.0195, "num_input_tokens_seen": 256596032, "step": 252000 }, { "epoch": 4.963477781496722, "grad_norm": 1.9294846057891846, "learning_rate": 7.533167469005219e-07, "loss": 2.0266, "num_input_tokens_seen": 256694728, "step": 252100 }, { "epoch": 4.9654466342462245, "grad_norm": 2.0432777404785156, "learning_rate": 7.531389644868848e-07, "loss": 1.9878, "num_input_tokens_seen": 256796448, "step": 252200 }, { "epoch": 4.967415486995727, "grad_norm": 1.7484453916549683, "learning_rate": 7.529611390291019e-07, "loss": 1.9393, "num_input_tokens_seen": 256896400, "step": 252300 }, { "epoch": 4.969384339745231, "grad_norm": 1.9300607442855835, "learning_rate": 7.52783270557411e-07, "loss": 1.979, "num_input_tokens_seen": 256998800, "step": 252400 }, { "epoch": 4.971353192494734, "grad_norm": 2.1486616134643555, "learning_rate": 7.526053591020568e-07, "loss": 2.0132, "num_input_tokens_seen": 257100432, "step": 252500 }, { "epoch": 4.973322045244236, "grad_norm": 1.813297986984253, "learning_rate": 7.524274046932919e-07, "loss": 1.9673, "num_input_tokens_seen": 257202832, "step": 252600 }, { "epoch": 4.975290897993739, "grad_norm": 1.8517218828201294, "learning_rate": 7.52249407361376e-07, "loss": 2.0336, "num_input_tokens_seen": 257304424, "step": 252700 }, { "epoch": 4.977259750743242, "grad_norm": 3.19309139251709, "learning_rate": 7.520713671365759e-07, "loss": 2.0102, "num_input_tokens_seen": 257405552, "step": 252800 }, { "epoch": 4.979228603492745, "grad_norm": 2.1946089267730713, "learning_rate": 7.518932840491659e-07, "loss": 1.9739, "num_input_tokens_seen": 257507224, "step": 252900 }, { "epoch": 4.981197456242247, "grad_norm": 1.8575719594955444, "learning_rate": 7.517151581294275e-07, "loss": 1.9371, "num_input_tokens_seen": 257609624, "step": 253000 }, { "epoch": 4.98316630899175, "grad_norm": 1.8853888511657715, "learning_rate": 7.515369894076494e-07, "loss": 1.9715, "num_input_tokens_seen": 257709776, "step": 253100 }, { "epoch": 4.985135161741253, "grad_norm": 2.0849671363830566, "learning_rate": 7.513587779141279e-07, "loss": 1.9851, "num_input_tokens_seen": 257811656, "step": 253200 }, { "epoch": 4.987104014490757, "grad_norm": 1.8125005960464478, "learning_rate": 7.511805236791664e-07, "loss": 1.9891, "num_input_tokens_seen": 257912328, "step": 253300 }, { "epoch": 4.989072867240259, "grad_norm": 1.8635026216506958, "learning_rate": 7.510022267330754e-07, "loss": 1.9633, "num_input_tokens_seen": 258014728, "step": 253400 }, { "epoch": 4.991041719989762, "grad_norm": 1.9553800821304321, "learning_rate": 7.508238871061725e-07, "loss": 1.9748, "num_input_tokens_seen": 258115848, "step": 253500 }, { "epoch": 4.993010572739265, "grad_norm": 2.6224100589752197, "learning_rate": 7.506455048287834e-07, "loss": 2.0179, "num_input_tokens_seen": 258217816, "step": 253600 }, { "epoch": 4.994979425488768, "grad_norm": 1.7693684101104736, "learning_rate": 7.504670799312403e-07, "loss": 1.9841, "num_input_tokens_seen": 258319440, "step": 253700 }, { "epoch": 4.99694827823827, "grad_norm": 1.981896162033081, "learning_rate": 7.502886124438826e-07, "loss": 2.0438, "num_input_tokens_seen": 258421840, "step": 253800 }, { "epoch": 4.998917130987773, "grad_norm": 1.9829676151275635, "learning_rate": 7.501101023970575e-07, "loss": 1.9882, "num_input_tokens_seen": 258524240, "step": 253900 }, { "epoch": 5.000885983737276, "grad_norm": 2.067901134490967, "learning_rate": 7.499315498211189e-07, "loss": 1.9443, "num_input_tokens_seen": 258626640, "step": 254000 }, { "epoch": 5.0028548364867795, "grad_norm": 2.22636342048645, "learning_rate": 7.497529547464284e-07, "loss": 1.9898, "num_input_tokens_seen": 258728416, "step": 254100 }, { "epoch": 5.004823689236282, "grad_norm": 1.7298223972320557, "learning_rate": 7.495743172033544e-07, "loss": 2.0092, "num_input_tokens_seen": 258830384, "step": 254200 }, { "epoch": 5.006792541985785, "grad_norm": 2.0167651176452637, "learning_rate": 7.493956372222728e-07, "loss": 2.0209, "num_input_tokens_seen": 258931456, "step": 254300 }, { "epoch": 5.008761394735288, "grad_norm": 1.8186672925949097, "learning_rate": 7.492169148335665e-07, "loss": 1.9968, "num_input_tokens_seen": 259032920, "step": 254400 }, { "epoch": 5.010730247484791, "grad_norm": 2.0461297035217285, "learning_rate": 7.49038150067626e-07, "loss": 2.0187, "num_input_tokens_seen": 259133080, "step": 254500 }, { "epoch": 5.012699100234293, "grad_norm": 2.007765769958496, "learning_rate": 7.488593429548484e-07, "loss": 1.9514, "num_input_tokens_seen": 259235480, "step": 254600 }, { "epoch": 5.014667952983796, "grad_norm": 1.9249708652496338, "learning_rate": 7.486804935256386e-07, "loss": 1.9713, "num_input_tokens_seen": 259337880, "step": 254700 }, { "epoch": 5.016636805733299, "grad_norm": 2.003084421157837, "learning_rate": 7.485016018104082e-07, "loss": 2.0099, "num_input_tokens_seen": 259440280, "step": 254800 }, { "epoch": 5.0186056584828025, "grad_norm": 1.9281221628189087, "learning_rate": 7.483226678395767e-07, "loss": 1.986, "num_input_tokens_seen": 259542000, "step": 254900 }, { "epoch": 5.020574511232305, "grad_norm": 9.370895385742188, "learning_rate": 7.481436916435696e-07, "loss": 1.9969, "num_input_tokens_seen": 259642976, "step": 255000 }, { "epoch": 5.022543363981808, "grad_norm": 1.9999701976776123, "learning_rate": 7.47964673252821e-07, "loss": 1.9783, "num_input_tokens_seen": 259743568, "step": 255100 }, { "epoch": 5.024512216731311, "grad_norm": 1.772251009941101, "learning_rate": 7.477856126977709e-07, "loss": 1.9714, "num_input_tokens_seen": 259845968, "step": 255200 }, { "epoch": 5.0264810694808135, "grad_norm": 1.958411455154419, "learning_rate": 7.476065100088674e-07, "loss": 1.9458, "num_input_tokens_seen": 259947256, "step": 255300 }, { "epoch": 5.028449922230316, "grad_norm": 1.9385576248168945, "learning_rate": 7.474273652165652e-07, "loss": 1.9767, "num_input_tokens_seen": 260049344, "step": 255400 }, { "epoch": 5.030418774979819, "grad_norm": 1.998084545135498, "learning_rate": 7.472481783513266e-07, "loss": 1.9445, "num_input_tokens_seen": 260151744, "step": 255500 }, { "epoch": 5.032387627729322, "grad_norm": 1.7117881774902344, "learning_rate": 7.470689494436204e-07, "loss": 1.9259, "num_input_tokens_seen": 260252792, "step": 255600 }, { "epoch": 5.034356480478825, "grad_norm": 1.8511531352996826, "learning_rate": 7.468896785239234e-07, "loss": 2.0109, "num_input_tokens_seen": 260354624, "step": 255700 }, { "epoch": 5.036325333228328, "grad_norm": 2.6892521381378174, "learning_rate": 7.467103656227189e-07, "loss": 1.9802, "num_input_tokens_seen": 260455456, "step": 255800 }, { "epoch": 5.038294185977831, "grad_norm": 2.0311710834503174, "learning_rate": 7.465310107704976e-07, "loss": 1.9659, "num_input_tokens_seen": 260557856, "step": 255900 }, { "epoch": 5.040263038727334, "grad_norm": 1.8495655059814453, "learning_rate": 7.463516139977573e-07, "loss": 1.9694, "num_input_tokens_seen": 260660256, "step": 256000 }, { "epoch": 5.0422318914768365, "grad_norm": 1.7853916883468628, "learning_rate": 7.461721753350027e-07, "loss": 2.0493, "num_input_tokens_seen": 260762656, "step": 256100 }, { "epoch": 5.044200744226339, "grad_norm": 2.032388687133789, "learning_rate": 7.459926948127462e-07, "loss": 1.9538, "num_input_tokens_seen": 260864416, "step": 256200 }, { "epoch": 5.046169596975842, "grad_norm": 2.157252550125122, "learning_rate": 7.458131724615067e-07, "loss": 1.9323, "num_input_tokens_seen": 260966816, "step": 256300 }, { "epoch": 5.048138449725345, "grad_norm": 2.6756553649902344, "learning_rate": 7.456336083118105e-07, "loss": 1.9893, "num_input_tokens_seen": 261067816, "step": 256400 }, { "epoch": 5.0501073024748475, "grad_norm": 1.8276116847991943, "learning_rate": 7.454540023941908e-07, "loss": 1.9601, "num_input_tokens_seen": 261170216, "step": 256500 }, { "epoch": 5.052076155224351, "grad_norm": 2.02026629447937, "learning_rate": 7.452743547391884e-07, "loss": 2.0006, "num_input_tokens_seen": 261271224, "step": 256600 }, { "epoch": 5.054045007973854, "grad_norm": 2.038952350616455, "learning_rate": 7.450946653773506e-07, "loss": 2.0365, "num_input_tokens_seen": 261372472, "step": 256700 }, { "epoch": 5.056013860723357, "grad_norm": 9.953147888183594, "learning_rate": 7.449149343392325e-07, "loss": 1.9775, "num_input_tokens_seen": 261472112, "step": 256800 }, { "epoch": 5.057982713472859, "grad_norm": 1.6050832271575928, "learning_rate": 7.447351616553951e-07, "loss": 1.9589, "num_input_tokens_seen": 261574512, "step": 256900 }, { "epoch": 5.059951566222362, "grad_norm": 1.9243571758270264, "learning_rate": 7.445553473564079e-07, "loss": 1.954, "num_input_tokens_seen": 261676912, "step": 257000 }, { "epoch": 5.061920418971865, "grad_norm": 1.702769160270691, "learning_rate": 7.443754914728465e-07, "loss": 2.0053, "num_input_tokens_seen": 261778688, "step": 257100 }, { "epoch": 5.063889271721368, "grad_norm": 5.400176048278809, "learning_rate": 7.44195594035294e-07, "loss": 1.9565, "num_input_tokens_seen": 261881088, "step": 257200 }, { "epoch": 5.0658581244708705, "grad_norm": 5.50039529800415, "learning_rate": 7.440156550743403e-07, "loss": 1.97, "num_input_tokens_seen": 261983488, "step": 257300 }, { "epoch": 5.067826977220374, "grad_norm": 1.8719820976257324, "learning_rate": 7.438356746205825e-07, "loss": 1.9726, "num_input_tokens_seen": 262085384, "step": 257400 }, { "epoch": 5.069795829969877, "grad_norm": 2.159721851348877, "learning_rate": 7.43655652704625e-07, "loss": 2.0127, "num_input_tokens_seen": 262187104, "step": 257500 }, { "epoch": 5.07176468271938, "grad_norm": 1.9829753637313843, "learning_rate": 7.434755893570787e-07, "loss": 1.9911, "num_input_tokens_seen": 262289400, "step": 257600 }, { "epoch": 5.073733535468882, "grad_norm": 1.8157498836517334, "learning_rate": 7.43295484608562e-07, "loss": 1.9775, "num_input_tokens_seen": 262391800, "step": 257700 }, { "epoch": 5.075702388218385, "grad_norm": 1.903546690940857, "learning_rate": 7.431153384897004e-07, "loss": 1.9711, "num_input_tokens_seen": 262493344, "step": 257800 }, { "epoch": 5.077671240967888, "grad_norm": 2.0336084365844727, "learning_rate": 7.42935151031126e-07, "loss": 1.9974, "num_input_tokens_seen": 262594912, "step": 257900 }, { "epoch": 5.079640093717391, "grad_norm": 2.0679564476013184, "learning_rate": 7.427549222634779e-07, "loss": 1.9706, "num_input_tokens_seen": 262697048, "step": 258000 }, { "epoch": 5.081608946466893, "grad_norm": 2.1089797019958496, "learning_rate": 7.42574652217403e-07, "loss": 1.9856, "num_input_tokens_seen": 262799448, "step": 258100 }, { "epoch": 5.083577799216396, "grad_norm": 2.1195034980773926, "learning_rate": 7.423943409235543e-07, "loss": 2.0093, "num_input_tokens_seen": 262901072, "step": 258200 }, { "epoch": 5.0855466519659, "grad_norm": 1.9945552349090576, "learning_rate": 7.422139884125924e-07, "loss": 2.0036, "num_input_tokens_seen": 263003472, "step": 258300 }, { "epoch": 5.087515504715403, "grad_norm": 1.980982780456543, "learning_rate": 7.420335947151849e-07, "loss": 2.0192, "num_input_tokens_seen": 263105160, "step": 258400 }, { "epoch": 5.089484357464905, "grad_norm": 1.9468029737472534, "learning_rate": 7.418531598620058e-07, "loss": 1.9804, "num_input_tokens_seen": 263205152, "step": 258500 }, { "epoch": 5.091453210214408, "grad_norm": 2.0993528366088867, "learning_rate": 7.416726838837368e-07, "loss": 1.9799, "num_input_tokens_seen": 263306312, "step": 258600 }, { "epoch": 5.093422062963911, "grad_norm": 2.0031890869140625, "learning_rate": 7.414921668110663e-07, "loss": 1.9747, "num_input_tokens_seen": 263408096, "step": 258700 }, { "epoch": 5.095390915713414, "grad_norm": 1.8896909952163696, "learning_rate": 7.413116086746897e-07, "loss": 2.0072, "num_input_tokens_seen": 263509784, "step": 258800 }, { "epoch": 5.097359768462916, "grad_norm": 1.9193813800811768, "learning_rate": 7.411310095053093e-07, "loss": 1.9938, "num_input_tokens_seen": 263611888, "step": 258900 }, { "epoch": 5.099328621212419, "grad_norm": 2.0843522548675537, "learning_rate": 7.409503693336346e-07, "loss": 1.9905, "num_input_tokens_seen": 263713360, "step": 259000 }, { "epoch": 5.101297473961923, "grad_norm": 2.25429630279541, "learning_rate": 7.407696881903818e-07, "loss": 2.0082, "num_input_tokens_seen": 263814176, "step": 259100 }, { "epoch": 5.1032663267114255, "grad_norm": 2.1886987686157227, "learning_rate": 7.405889661062743e-07, "loss": 1.999, "num_input_tokens_seen": 263915088, "step": 259200 }, { "epoch": 5.105235179460928, "grad_norm": 1.743133544921875, "learning_rate": 7.404082031120422e-07, "loss": 1.9333, "num_input_tokens_seen": 264016672, "step": 259300 }, { "epoch": 5.107204032210431, "grad_norm": 1.8552533388137817, "learning_rate": 7.40227399238423e-07, "loss": 1.994, "num_input_tokens_seen": 264118272, "step": 259400 }, { "epoch": 5.109172884959934, "grad_norm": 1.9470716714859009, "learning_rate": 7.400465545161607e-07, "loss": 1.9545, "num_input_tokens_seen": 264220672, "step": 259500 }, { "epoch": 5.111141737709437, "grad_norm": 1.7200884819030762, "learning_rate": 7.398656689760065e-07, "loss": 1.964, "num_input_tokens_seen": 264323072, "step": 259600 }, { "epoch": 5.113110590458939, "grad_norm": 1.9774038791656494, "learning_rate": 7.396847426487183e-07, "loss": 1.9751, "num_input_tokens_seen": 264424128, "step": 259700 }, { "epoch": 5.115079443208442, "grad_norm": 1.903700828552246, "learning_rate": 7.395037755650613e-07, "loss": 2.0092, "num_input_tokens_seen": 264525896, "step": 259800 }, { "epoch": 5.117048295957945, "grad_norm": 1.8798542022705078, "learning_rate": 7.393227677558072e-07, "loss": 1.9909, "num_input_tokens_seen": 264628296, "step": 259900 }, { "epoch": 5.1190171487074485, "grad_norm": 2.2121682167053223, "learning_rate": 7.391417192517352e-07, "loss": 1.9974, "num_input_tokens_seen": 264728216, "step": 260000 }, { "epoch": 5.120986001456951, "grad_norm": 1.7667813301086426, "learning_rate": 7.389606300836306e-07, "loss": 1.9967, "num_input_tokens_seen": 264830616, "step": 260100 }, { "epoch": 5.122954854206454, "grad_norm": 1.9347078800201416, "learning_rate": 7.387795002822866e-07, "loss": 1.9633, "num_input_tokens_seen": 264932528, "step": 260200 }, { "epoch": 5.124923706955957, "grad_norm": 11.775089263916016, "learning_rate": 7.385983298785023e-07, "loss": 1.9677, "num_input_tokens_seen": 265034360, "step": 260300 }, { "epoch": 5.1268925597054595, "grad_norm": 1.8936445713043213, "learning_rate": 7.384171189030848e-07, "loss": 1.9643, "num_input_tokens_seen": 265136048, "step": 260400 }, { "epoch": 5.128861412454962, "grad_norm": 1.8390278816223145, "learning_rate": 7.382358673868467e-07, "loss": 1.9801, "num_input_tokens_seen": 265237584, "step": 260500 }, { "epoch": 5.130830265204465, "grad_norm": 2.2224788665771484, "learning_rate": 7.380545753606091e-07, "loss": 2.0165, "num_input_tokens_seen": 265339432, "step": 260600 }, { "epoch": 5.132799117953968, "grad_norm": 2.1867525577545166, "learning_rate": 7.378732428551985e-07, "loss": 1.9829, "num_input_tokens_seen": 265440536, "step": 260700 }, { "epoch": 5.1347679707034715, "grad_norm": 1.8687938451766968, "learning_rate": 7.376918699014494e-07, "loss": 1.9595, "num_input_tokens_seen": 265542360, "step": 260800 }, { "epoch": 5.136736823452974, "grad_norm": 1.8443448543548584, "learning_rate": 7.375104565302027e-07, "loss": 1.9618, "num_input_tokens_seen": 265644760, "step": 260900 }, { "epoch": 5.138705676202477, "grad_norm": 2.1384170055389404, "learning_rate": 7.373290027723059e-07, "loss": 2.0161, "num_input_tokens_seen": 265747160, "step": 261000 }, { "epoch": 5.14067452895198, "grad_norm": 2.0410032272338867, "learning_rate": 7.37147508658614e-07, "loss": 2.0163, "num_input_tokens_seen": 265846320, "step": 261100 }, { "epoch": 5.1426433817014825, "grad_norm": 2.0527236461639404, "learning_rate": 7.369659742199886e-07, "loss": 1.9463, "num_input_tokens_seen": 265948720, "step": 261200 }, { "epoch": 5.144612234450985, "grad_norm": 2.003117322921753, "learning_rate": 7.367843994872978e-07, "loss": 2.0115, "num_input_tokens_seen": 266049864, "step": 261300 }, { "epoch": 5.146581087200488, "grad_norm": 1.7844789028167725, "learning_rate": 7.36602784491417e-07, "loss": 1.9823, "num_input_tokens_seen": 266152264, "step": 261400 }, { "epoch": 5.148549939949991, "grad_norm": 2.2948644161224365, "learning_rate": 7.364211292632283e-07, "loss": 1.9628, "num_input_tokens_seen": 266254664, "step": 261500 }, { "epoch": 5.150518792699494, "grad_norm": 11.594809532165527, "learning_rate": 7.362394338336206e-07, "loss": 1.9456, "num_input_tokens_seen": 266355376, "step": 261600 }, { "epoch": 5.152487645448997, "grad_norm": 1.8288902044296265, "learning_rate": 7.360576982334898e-07, "loss": 1.9937, "num_input_tokens_seen": 266457776, "step": 261700 }, { "epoch": 5.1544564981985, "grad_norm": 2.08296799659729, "learning_rate": 7.358759224937383e-07, "loss": 1.9522, "num_input_tokens_seen": 266560176, "step": 261800 }, { "epoch": 5.156425350948003, "grad_norm": 1.9673665761947632, "learning_rate": 7.356941066452757e-07, "loss": 1.9974, "num_input_tokens_seen": 266661616, "step": 261900 }, { "epoch": 5.1583942036975055, "grad_norm": 2.078920364379883, "learning_rate": 7.355122507190184e-07, "loss": 2.0107, "num_input_tokens_seen": 266762344, "step": 262000 }, { "epoch": 5.160363056447008, "grad_norm": 1.879564881324768, "learning_rate": 7.353303547458891e-07, "loss": 1.9975, "num_input_tokens_seen": 266864744, "step": 262100 }, { "epoch": 5.162331909196511, "grad_norm": 1.6097413301467896, "learning_rate": 7.351484187568177e-07, "loss": 1.9982, "num_input_tokens_seen": 266966320, "step": 262200 }, { "epoch": 5.164300761946014, "grad_norm": 1.8999217748641968, "learning_rate": 7.349664427827413e-07, "loss": 1.9637, "num_input_tokens_seen": 267068720, "step": 262300 }, { "epoch": 5.166269614695517, "grad_norm": 2.17246675491333, "learning_rate": 7.347844268546029e-07, "loss": 2.0391, "num_input_tokens_seen": 267170592, "step": 262400 }, { "epoch": 5.16823846744502, "grad_norm": 1.8745571374893188, "learning_rate": 7.346023710033531e-07, "loss": 2.0162, "num_input_tokens_seen": 267272944, "step": 262500 }, { "epoch": 5.170207320194523, "grad_norm": 1.9142630100250244, "learning_rate": 7.344202752599488e-07, "loss": 2.0217, "num_input_tokens_seen": 267375344, "step": 262600 }, { "epoch": 5.172176172944026, "grad_norm": 1.863242745399475, "learning_rate": 7.342381396553539e-07, "loss": 1.9593, "num_input_tokens_seen": 267477744, "step": 262700 }, { "epoch": 5.174145025693528, "grad_norm": 1.8339025974273682, "learning_rate": 7.340559642205388e-07, "loss": 2.0127, "num_input_tokens_seen": 267578648, "step": 262800 }, { "epoch": 5.176113878443031, "grad_norm": 1.8508989810943604, "learning_rate": 7.338737489864814e-07, "loss": 1.9917, "num_input_tokens_seen": 267681048, "step": 262900 }, { "epoch": 5.178082731192534, "grad_norm": 1.758082389831543, "learning_rate": 7.336914939841654e-07, "loss": 1.9616, "num_input_tokens_seen": 267783448, "step": 263000 }, { "epoch": 5.180051583942037, "grad_norm": 1.7795236110687256, "learning_rate": 7.335091992445819e-07, "loss": 1.9658, "num_input_tokens_seen": 267885848, "step": 263100 }, { "epoch": 5.1820204366915394, "grad_norm": 1.7803370952606201, "learning_rate": 7.333268647987286e-07, "loss": 1.995, "num_input_tokens_seen": 267987696, "step": 263200 }, { "epoch": 5.183989289441043, "grad_norm": 1.8964024782180786, "learning_rate": 7.3314449067761e-07, "loss": 1.961, "num_input_tokens_seen": 268090096, "step": 263300 }, { "epoch": 5.185958142190546, "grad_norm": 2.2240588665008545, "learning_rate": 7.329620769122371e-07, "loss": 2.0312, "num_input_tokens_seen": 268192496, "step": 263400 }, { "epoch": 5.187926994940049, "grad_norm": 1.8488363027572632, "learning_rate": 7.32779623533628e-07, "loss": 1.9876, "num_input_tokens_seen": 268294600, "step": 263500 }, { "epoch": 5.189895847689551, "grad_norm": 2.171096086502075, "learning_rate": 7.325971305728071e-07, "loss": 2.0159, "num_input_tokens_seen": 268395568, "step": 263600 }, { "epoch": 5.191864700439054, "grad_norm": 1.946254014968872, "learning_rate": 7.324145980608061e-07, "loss": 1.9888, "num_input_tokens_seen": 268497280, "step": 263700 }, { "epoch": 5.193833553188557, "grad_norm": 1.659362554550171, "learning_rate": 7.322320260286628e-07, "loss": 1.9865, "num_input_tokens_seen": 268598872, "step": 263800 }, { "epoch": 5.19580240593806, "grad_norm": 2.198559045791626, "learning_rate": 7.320494145074225e-07, "loss": 2.0033, "num_input_tokens_seen": 268701272, "step": 263900 }, { "epoch": 5.197771258687562, "grad_norm": 1.7130135297775269, "learning_rate": 7.318667635281362e-07, "loss": 2.0082, "num_input_tokens_seen": 268802096, "step": 264000 }, { "epoch": 5.199740111437066, "grad_norm": 2.05407977104187, "learning_rate": 7.316840731218626e-07, "loss": 1.9643, "num_input_tokens_seen": 268904496, "step": 264100 }, { "epoch": 5.201708964186569, "grad_norm": 1.9083105325698853, "learning_rate": 7.315013433196664e-07, "loss": 1.9745, "num_input_tokens_seen": 269005560, "step": 264200 }, { "epoch": 5.203677816936072, "grad_norm": 1.7571542263031006, "learning_rate": 7.313185741526194e-07, "loss": 1.9806, "num_input_tokens_seen": 269107960, "step": 264300 }, { "epoch": 5.205646669685574, "grad_norm": 2.0068085193634033, "learning_rate": 7.311357656517998e-07, "loss": 1.9971, "num_input_tokens_seen": 269210360, "step": 264400 }, { "epoch": 5.207615522435077, "grad_norm": 2.325728178024292, "learning_rate": 7.309529178482929e-07, "loss": 2.0123, "num_input_tokens_seen": 269312760, "step": 264500 }, { "epoch": 5.20958437518458, "grad_norm": 1.9679704904556274, "learning_rate": 7.3077003077319e-07, "loss": 1.975, "num_input_tokens_seen": 269413704, "step": 264600 }, { "epoch": 5.211553227934083, "grad_norm": 1.719275951385498, "learning_rate": 7.3058710445759e-07, "loss": 1.9636, "num_input_tokens_seen": 269514696, "step": 264700 }, { "epoch": 5.213522080683585, "grad_norm": 1.858886957168579, "learning_rate": 7.304041389325977e-07, "loss": 1.9479, "num_input_tokens_seen": 269616544, "step": 264800 }, { "epoch": 5.215490933433088, "grad_norm": 1.9864132404327393, "learning_rate": 7.302211342293248e-07, "loss": 1.9733, "num_input_tokens_seen": 269718400, "step": 264900 }, { "epoch": 5.217459786182592, "grad_norm": 1.702999234199524, "learning_rate": 7.300380903788899e-07, "loss": 1.9468, "num_input_tokens_seen": 269820800, "step": 265000 }, { "epoch": 5.2194286389320945, "grad_norm": 2.4030091762542725, "learning_rate": 7.298550074124177e-07, "loss": 1.9844, "num_input_tokens_seen": 269920656, "step": 265100 }, { "epoch": 5.221397491681597, "grad_norm": 2.1367692947387695, "learning_rate": 7.296718853610407e-07, "loss": 1.9744, "num_input_tokens_seen": 270022280, "step": 265200 }, { "epoch": 5.2233663444311, "grad_norm": 1.9486796855926514, "learning_rate": 7.294887242558962e-07, "loss": 2.0031, "num_input_tokens_seen": 270123712, "step": 265300 }, { "epoch": 5.225335197180603, "grad_norm": 2.01839280128479, "learning_rate": 7.293055241281301e-07, "loss": 2.0585, "num_input_tokens_seen": 270226112, "step": 265400 }, { "epoch": 5.227304049930106, "grad_norm": 1.9012913703918457, "learning_rate": 7.291222850088935e-07, "loss": 1.9708, "num_input_tokens_seen": 270328512, "step": 265500 }, { "epoch": 5.229272902679608, "grad_norm": 1.9223334789276123, "learning_rate": 7.289390069293448e-07, "loss": 2.0117, "num_input_tokens_seen": 270430912, "step": 265600 }, { "epoch": 5.231241755429111, "grad_norm": 2.0429482460021973, "learning_rate": 7.287556899206491e-07, "loss": 2.0293, "num_input_tokens_seen": 270533312, "step": 265700 }, { "epoch": 5.233210608178615, "grad_norm": 1.7632092237472534, "learning_rate": 7.285723340139776e-07, "loss": 1.991, "num_input_tokens_seen": 270635536, "step": 265800 }, { "epoch": 5.2351794609281175, "grad_norm": 1.9305258989334106, "learning_rate": 7.283889392405086e-07, "loss": 1.9637, "num_input_tokens_seen": 270737936, "step": 265900 }, { "epoch": 5.23714831367762, "grad_norm": 1.7483024597167969, "learning_rate": 7.282055056314269e-07, "loss": 2.0053, "num_input_tokens_seen": 270839576, "step": 266000 }, { "epoch": 5.239117166427123, "grad_norm": 1.9088778495788574, "learning_rate": 7.280220332179236e-07, "loss": 1.9983, "num_input_tokens_seen": 270941144, "step": 266100 }, { "epoch": 5.241086019176626, "grad_norm": 1.9415194988250732, "learning_rate": 7.278385220311969e-07, "loss": 2.0006, "num_input_tokens_seen": 271041760, "step": 266200 }, { "epoch": 5.2430548719261285, "grad_norm": 1.802951693534851, "learning_rate": 7.276549721024511e-07, "loss": 1.9903, "num_input_tokens_seen": 271143440, "step": 266300 }, { "epoch": 5.245023724675631, "grad_norm": 1.9440362453460693, "learning_rate": 7.274713834628976e-07, "loss": 1.9921, "num_input_tokens_seen": 271245072, "step": 266400 }, { "epoch": 5.246992577425134, "grad_norm": 1.9700713157653809, "learning_rate": 7.272877561437538e-07, "loss": 1.9831, "num_input_tokens_seen": 271346584, "step": 266500 }, { "epoch": 5.248961430174637, "grad_norm": 3.628180742263794, "learning_rate": 7.271040901762442e-07, "loss": 1.9835, "num_input_tokens_seen": 271447520, "step": 266600 }, { "epoch": 5.25093028292414, "grad_norm": 1.8094944953918457, "learning_rate": 7.269203855915994e-07, "loss": 1.9918, "num_input_tokens_seen": 271549344, "step": 266700 }, { "epoch": 5.252899135673643, "grad_norm": 2.1155598163604736, "learning_rate": 7.267366424210569e-07, "loss": 1.9705, "num_input_tokens_seen": 271651744, "step": 266800 }, { "epoch": 5.254867988423146, "grad_norm": 2.0874040126800537, "learning_rate": 7.265528606958608e-07, "loss": 1.9565, "num_input_tokens_seen": 271753728, "step": 266900 }, { "epoch": 5.256836841172649, "grad_norm": 2.0286948680877686, "learning_rate": 7.263690404472617e-07, "loss": 1.9715, "num_input_tokens_seen": 271856128, "step": 267000 }, { "epoch": 5.2588056939221515, "grad_norm": 2.1304943561553955, "learning_rate": 7.261851817065166e-07, "loss": 1.9564, "num_input_tokens_seen": 271957712, "step": 267100 }, { "epoch": 5.260774546671654, "grad_norm": 2.0912668704986572, "learning_rate": 7.260012845048889e-07, "loss": 1.9743, "num_input_tokens_seen": 272060112, "step": 267200 }, { "epoch": 5.262743399421157, "grad_norm": 1.8552043437957764, "learning_rate": 7.258173488736491e-07, "loss": 1.9784, "num_input_tokens_seen": 272161680, "step": 267300 }, { "epoch": 5.26471225217066, "grad_norm": 1.8491747379302979, "learning_rate": 7.256333748440738e-07, "loss": 1.989, "num_input_tokens_seen": 272261816, "step": 267400 }, { "epoch": 5.266681104920163, "grad_norm": 2.0953633785247803, "learning_rate": 7.254493624474459e-07, "loss": 1.9825, "num_input_tokens_seen": 272363528, "step": 267500 }, { "epoch": 5.268649957669666, "grad_norm": 1.7955294847488403, "learning_rate": 7.252653117150558e-07, "loss": 1.9691, "num_input_tokens_seen": 272465928, "step": 267600 }, { "epoch": 5.270618810419169, "grad_norm": 1.9059914350509644, "learning_rate": 7.250812226781992e-07, "loss": 1.9479, "num_input_tokens_seen": 272568328, "step": 267700 }, { "epoch": 5.272587663168672, "grad_norm": 1.9488171339035034, "learning_rate": 7.248970953681791e-07, "loss": 1.9611, "num_input_tokens_seen": 272669912, "step": 267800 }, { "epoch": 5.274556515918174, "grad_norm": 1.8518226146697998, "learning_rate": 7.247129298163047e-07, "loss": 1.9795, "num_input_tokens_seen": 272772312, "step": 267900 }, { "epoch": 5.276525368667677, "grad_norm": 2.598212957382202, "learning_rate": 7.245287260538922e-07, "loss": 1.9582, "num_input_tokens_seen": 272874712, "step": 268000 }, { "epoch": 5.27849422141718, "grad_norm": 2.0378260612487793, "learning_rate": 7.243444841122633e-07, "loss": 2.0094, "num_input_tokens_seen": 272976384, "step": 268100 }, { "epoch": 5.280463074166683, "grad_norm": 2.052414894104004, "learning_rate": 7.241602040227473e-07, "loss": 1.96, "num_input_tokens_seen": 273078784, "step": 268200 }, { "epoch": 5.282431926916186, "grad_norm": 2.1668992042541504, "learning_rate": 7.239758858166792e-07, "loss": 1.9581, "num_input_tokens_seen": 273181184, "step": 268300 }, { "epoch": 5.284400779665689, "grad_norm": 1.810660719871521, "learning_rate": 7.237915295254009e-07, "loss": 2.0088, "num_input_tokens_seen": 273283008, "step": 268400 }, { "epoch": 5.286369632415192, "grad_norm": 1.9220634698867798, "learning_rate": 7.236071351802604e-07, "loss": 1.9636, "num_input_tokens_seen": 273384560, "step": 268500 }, { "epoch": 5.288338485164695, "grad_norm": 1.7055270671844482, "learning_rate": 7.234227028126126e-07, "loss": 1.9836, "num_input_tokens_seen": 273486440, "step": 268600 }, { "epoch": 5.290307337914197, "grad_norm": 2.014674663543701, "learning_rate": 7.232382324538187e-07, "loss": 1.9769, "num_input_tokens_seen": 273588264, "step": 268700 }, { "epoch": 5.2922761906637, "grad_norm": 1.929937720298767, "learning_rate": 7.230537241352461e-07, "loss": 1.9866, "num_input_tokens_seen": 273690064, "step": 268800 }, { "epoch": 5.294245043413203, "grad_norm": 1.5885132551193237, "learning_rate": 7.228691778882692e-07, "loss": 1.9612, "num_input_tokens_seen": 273792464, "step": 268900 }, { "epoch": 5.296213896162706, "grad_norm": 1.7675259113311768, "learning_rate": 7.226845937442682e-07, "loss": 1.9569, "num_input_tokens_seen": 273894000, "step": 269000 }, { "epoch": 5.298182748912209, "grad_norm": 2.164607286453247, "learning_rate": 7.224999717346303e-07, "loss": 2.022, "num_input_tokens_seen": 273994816, "step": 269100 }, { "epoch": 5.300151601661712, "grad_norm": 2.145895004272461, "learning_rate": 7.223153118907489e-07, "loss": 2.0185, "num_input_tokens_seen": 274097216, "step": 269200 }, { "epoch": 5.302120454411215, "grad_norm": 1.8865959644317627, "learning_rate": 7.221306142440238e-07, "loss": 1.9796, "num_input_tokens_seen": 274199616, "step": 269300 }, { "epoch": 5.304089307160718, "grad_norm": 2.5805323123931885, "learning_rate": 7.219458788258613e-07, "loss": 2.0091, "num_input_tokens_seen": 274301400, "step": 269400 }, { "epoch": 5.30605815991022, "grad_norm": 1.778024673461914, "learning_rate": 7.21761105667674e-07, "loss": 1.988, "num_input_tokens_seen": 274403176, "step": 269500 }, { "epoch": 5.308027012659723, "grad_norm": 2.467095375061035, "learning_rate": 7.215762948008812e-07, "loss": 1.9659, "num_input_tokens_seen": 274504024, "step": 269600 }, { "epoch": 5.309995865409226, "grad_norm": 1.9283493757247925, "learning_rate": 7.213914462569081e-07, "loss": 1.9774, "num_input_tokens_seen": 274606136, "step": 269700 }, { "epoch": 5.311964718158729, "grad_norm": 1.9295318126678467, "learning_rate": 7.212065600671872e-07, "loss": 2.0061, "num_input_tokens_seen": 274706624, "step": 269800 }, { "epoch": 5.313933570908231, "grad_norm": 0.0, "learning_rate": 7.210216362631564e-07, "loss": 1.9677, "num_input_tokens_seen": 274808480, "step": 269900 }, { "epoch": 5.315902423657735, "grad_norm": 1.8951165676116943, "learning_rate": 7.208366748762604e-07, "loss": 1.9482, "num_input_tokens_seen": 274910880, "step": 270000 }, { "epoch": 5.317871276407238, "grad_norm": 1.9429322481155396, "learning_rate": 7.206516759379506e-07, "loss": 2.0014, "num_input_tokens_seen": 275012448, "step": 270100 }, { "epoch": 5.3198401291567405, "grad_norm": 1.852536916732788, "learning_rate": 7.204666394796843e-07, "loss": 1.96, "num_input_tokens_seen": 275113992, "step": 270200 }, { "epoch": 5.321808981906243, "grad_norm": 1.945887804031372, "learning_rate": 7.202815655329255e-07, "loss": 2.0011, "num_input_tokens_seen": 275216392, "step": 270300 }, { "epoch": 5.323777834655746, "grad_norm": 2.177349805831909, "learning_rate": 7.200964541291443e-07, "loss": 1.9628, "num_input_tokens_seen": 275318792, "step": 270400 }, { "epoch": 5.325746687405249, "grad_norm": 1.929465889930725, "learning_rate": 7.199113052998177e-07, "loss": 1.9621, "num_input_tokens_seen": 275420112, "step": 270500 }, { "epoch": 5.327715540154752, "grad_norm": 1.8120075464248657, "learning_rate": 7.197261190764283e-07, "loss": 2.0163, "num_input_tokens_seen": 275522512, "step": 270600 }, { "epoch": 5.329684392904254, "grad_norm": 1.9308222532272339, "learning_rate": 7.195408954904656e-07, "loss": 2.009, "num_input_tokens_seen": 275624912, "step": 270700 }, { "epoch": 5.331653245653758, "grad_norm": 2.230597496032715, "learning_rate": 7.193556345734253e-07, "loss": 1.9733, "num_input_tokens_seen": 275727312, "step": 270800 }, { "epoch": 5.333622098403261, "grad_norm": 1.8992881774902344, "learning_rate": 7.191703363568095e-07, "loss": 1.99, "num_input_tokens_seen": 275828920, "step": 270900 }, { "epoch": 5.3355909511527635, "grad_norm": 1.7125335931777954, "learning_rate": 7.189850008721264e-07, "loss": 1.9107, "num_input_tokens_seen": 275931320, "step": 271000 }, { "epoch": 5.337559803902266, "grad_norm": 1.9915541410446167, "learning_rate": 7.187996281508912e-07, "loss": 1.9711, "num_input_tokens_seen": 276033720, "step": 271100 }, { "epoch": 5.339528656651769, "grad_norm": 1.9223518371582031, "learning_rate": 7.186142182246245e-07, "loss": 2.0245, "num_input_tokens_seen": 276136120, "step": 271200 }, { "epoch": 5.341497509401272, "grad_norm": 2.25526762008667, "learning_rate": 7.18428771124854e-07, "loss": 1.9876, "num_input_tokens_seen": 276237960, "step": 271300 }, { "epoch": 5.3434663621507745, "grad_norm": 2.335881471633911, "learning_rate": 7.182432868831131e-07, "loss": 1.9671, "num_input_tokens_seen": 276340360, "step": 271400 }, { "epoch": 5.345435214900277, "grad_norm": 1.7091542482376099, "learning_rate": 7.180577655309422e-07, "loss": 1.9718, "num_input_tokens_seen": 276442760, "step": 271500 }, { "epoch": 5.34740406764978, "grad_norm": 2.0681424140930176, "learning_rate": 7.178722070998872e-07, "loss": 2.0358, "num_input_tokens_seen": 276545112, "step": 271600 }, { "epoch": 5.349372920399284, "grad_norm": 1.8593565225601196, "learning_rate": 7.176866116215014e-07, "loss": 1.9698, "num_input_tokens_seen": 276647512, "step": 271700 }, { "epoch": 5.3513417731487865, "grad_norm": 2.043499708175659, "learning_rate": 7.17500979127343e-07, "loss": 2.0253, "num_input_tokens_seen": 276749544, "step": 271800 }, { "epoch": 5.353310625898289, "grad_norm": 2.8592991828918457, "learning_rate": 7.173153096489776e-07, "loss": 1.9873, "num_input_tokens_seen": 276851944, "step": 271900 }, { "epoch": 5.355279478647792, "grad_norm": 2.0717766284942627, "learning_rate": 7.171296032179768e-07, "loss": 1.9718, "num_input_tokens_seen": 276954344, "step": 272000 }, { "epoch": 5.357248331397295, "grad_norm": 1.941322922706604, "learning_rate": 7.169438598659185e-07, "loss": 2.0223, "num_input_tokens_seen": 277056200, "step": 272100 }, { "epoch": 5.3592171841467975, "grad_norm": 2.0059738159179688, "learning_rate": 7.167580796243864e-07, "loss": 1.9871, "num_input_tokens_seen": 277158600, "step": 272200 }, { "epoch": 5.3611860368963, "grad_norm": 3.2781922817230225, "learning_rate": 7.165722625249712e-07, "loss": 2.0257, "num_input_tokens_seen": 277260912, "step": 272300 }, { "epoch": 5.363154889645803, "grad_norm": 1.8294517993927002, "learning_rate": 7.163864085992693e-07, "loss": 2.0039, "num_input_tokens_seen": 277361752, "step": 272400 }, { "epoch": 5.365123742395307, "grad_norm": 1.742654800415039, "learning_rate": 7.162005178788839e-07, "loss": 1.9659, "num_input_tokens_seen": 277464152, "step": 272500 }, { "epoch": 5.367092595144809, "grad_norm": 1.9192051887512207, "learning_rate": 7.160145903954238e-07, "loss": 1.9592, "num_input_tokens_seen": 277566552, "step": 272600 }, { "epoch": 5.369061447894312, "grad_norm": 2.0247249603271484, "learning_rate": 7.158286261805046e-07, "loss": 1.9748, "num_input_tokens_seen": 277667784, "step": 272700 }, { "epoch": 5.371030300643815, "grad_norm": 2.092761278152466, "learning_rate": 7.156426252657478e-07, "loss": 1.9688, "num_input_tokens_seen": 277770184, "step": 272800 }, { "epoch": 5.372999153393318, "grad_norm": 1.821317434310913, "learning_rate": 7.154565876827814e-07, "loss": 1.9271, "num_input_tokens_seen": 277872584, "step": 272900 }, { "epoch": 5.37496800614282, "grad_norm": 1.9703651666641235, "learning_rate": 7.152705134632396e-07, "loss": 1.9833, "num_input_tokens_seen": 277974440, "step": 273000 }, { "epoch": 5.376936858892323, "grad_norm": 2.015634059906006, "learning_rate": 7.150844026387627e-07, "loss": 2.0069, "num_input_tokens_seen": 278075432, "step": 273100 }, { "epoch": 5.378905711641826, "grad_norm": 1.8306469917297363, "learning_rate": 7.148982552409972e-07, "loss": 1.9866, "num_input_tokens_seen": 278177832, "step": 273200 }, { "epoch": 5.380874564391329, "grad_norm": 5.945248126983643, "learning_rate": 7.147120713015958e-07, "loss": 1.9769, "num_input_tokens_seen": 278280232, "step": 273300 }, { "epoch": 5.382843417140832, "grad_norm": 1.8421549797058105, "learning_rate": 7.145258508522178e-07, "loss": 1.9737, "num_input_tokens_seen": 278382632, "step": 273400 }, { "epoch": 5.384812269890335, "grad_norm": 2.0037624835968018, "learning_rate": 7.143395939245282e-07, "loss": 1.9702, "num_input_tokens_seen": 278485032, "step": 273500 }, { "epoch": 5.386781122639838, "grad_norm": 1.753377079963684, "learning_rate": 7.141533005501985e-07, "loss": 2.0035, "num_input_tokens_seen": 278587432, "step": 273600 }, { "epoch": 5.388749975389341, "grad_norm": 1.9795148372650146, "learning_rate": 7.139669707609063e-07, "loss": 1.9657, "num_input_tokens_seen": 278688496, "step": 273700 }, { "epoch": 5.390718828138843, "grad_norm": 1.8461229801177979, "learning_rate": 7.137806045883353e-07, "loss": 2.0223, "num_input_tokens_seen": 278790512, "step": 273800 }, { "epoch": 5.392687680888346, "grad_norm": 1.8531826734542847, "learning_rate": 7.135942020641757e-07, "loss": 1.9799, "num_input_tokens_seen": 278892256, "step": 273900 }, { "epoch": 5.394656533637849, "grad_norm": 2.0050060749053955, "learning_rate": 7.134077632201236e-07, "loss": 2.0154, "num_input_tokens_seen": 278992488, "step": 274000 }, { "epoch": 5.396625386387352, "grad_norm": 1.7922987937927246, "learning_rate": 7.132212880878812e-07, "loss": 2.0385, "num_input_tokens_seen": 279094224, "step": 274100 }, { "epoch": 5.398594239136855, "grad_norm": 2.1020994186401367, "learning_rate": 7.130347766991572e-07, "loss": 1.9438, "num_input_tokens_seen": 279195992, "step": 274200 }, { "epoch": 5.400563091886358, "grad_norm": 1.7647255659103394, "learning_rate": 7.128482290856663e-07, "loss": 1.9707, "num_input_tokens_seen": 279297160, "step": 274300 }, { "epoch": 5.402531944635861, "grad_norm": 2.0838844776153564, "learning_rate": 7.126616452791293e-07, "loss": 1.9841, "num_input_tokens_seen": 279399072, "step": 274400 }, { "epoch": 5.404500797385364, "grad_norm": 1.7502622604370117, "learning_rate": 7.124750253112732e-07, "loss": 1.9721, "num_input_tokens_seen": 279501472, "step": 274500 }, { "epoch": 5.406469650134866, "grad_norm": 2.0231852531433105, "learning_rate": 7.12288369213831e-07, "loss": 1.99, "num_input_tokens_seen": 279603168, "step": 274600 }, { "epoch": 5.408438502884369, "grad_norm": 1.9301915168762207, "learning_rate": 7.121016770185423e-07, "loss": 1.9967, "num_input_tokens_seen": 279705568, "step": 274700 }, { "epoch": 5.410407355633872, "grad_norm": 1.983007788658142, "learning_rate": 7.119149487571523e-07, "loss": 1.9451, "num_input_tokens_seen": 279807000, "step": 274800 }, { "epoch": 5.412376208383375, "grad_norm": 1.8708869218826294, "learning_rate": 7.117281844614126e-07, "loss": 1.9594, "num_input_tokens_seen": 279909400, "step": 274900 }, { "epoch": 5.414345061132878, "grad_norm": 1.9879542589187622, "learning_rate": 7.115413841630809e-07, "loss": 2.0033, "num_input_tokens_seen": 280011800, "step": 275000 }, { "epoch": 5.416313913882381, "grad_norm": 1.6958532333374023, "learning_rate": 7.113545478939211e-07, "loss": 2.0198, "num_input_tokens_seen": 280113360, "step": 275100 }, { "epoch": 5.418282766631884, "grad_norm": 1.8821911811828613, "learning_rate": 7.111676756857033e-07, "loss": 1.9338, "num_input_tokens_seen": 280215760, "step": 275200 }, { "epoch": 5.4202516193813866, "grad_norm": 1.8032538890838623, "learning_rate": 7.109807675702029e-07, "loss": 2.0059, "num_input_tokens_seen": 280317640, "step": 275300 }, { "epoch": 5.422220472130889, "grad_norm": 1.7668246030807495, "learning_rate": 7.107938235792028e-07, "loss": 1.9434, "num_input_tokens_seen": 280419664, "step": 275400 }, { "epoch": 5.424189324880392, "grad_norm": 1.779638409614563, "learning_rate": 7.106068437444908e-07, "loss": 1.9485, "num_input_tokens_seen": 280521488, "step": 275500 }, { "epoch": 5.426158177629895, "grad_norm": 1.7614749670028687, "learning_rate": 7.104198280978614e-07, "loss": 2.0248, "num_input_tokens_seen": 280623888, "step": 275600 }, { "epoch": 5.428127030379398, "grad_norm": 2.207742929458618, "learning_rate": 7.102327766711149e-07, "loss": 1.9829, "num_input_tokens_seen": 280725144, "step": 275700 }, { "epoch": 5.430095883128901, "grad_norm": 1.989269495010376, "learning_rate": 7.100456894960581e-07, "loss": 1.9831, "num_input_tokens_seen": 280826584, "step": 275800 }, { "epoch": 5.432064735878404, "grad_norm": 2.120246171951294, "learning_rate": 7.098585666045032e-07, "loss": 1.972, "num_input_tokens_seen": 280928320, "step": 275900 }, { "epoch": 5.434033588627907, "grad_norm": 1.8878870010375977, "learning_rate": 7.096714080282691e-07, "loss": 1.9603, "num_input_tokens_seen": 281030720, "step": 276000 }, { "epoch": 5.4360024413774095, "grad_norm": 2.058067560195923, "learning_rate": 7.094842137991804e-07, "loss": 1.9911, "num_input_tokens_seen": 281133120, "step": 276100 }, { "epoch": 5.437971294126912, "grad_norm": 1.9082746505737305, "learning_rate": 7.092969839490683e-07, "loss": 2.0134, "num_input_tokens_seen": 281234952, "step": 276200 }, { "epoch": 5.439940146876415, "grad_norm": 2.6323392391204834, "learning_rate": 7.091097185097692e-07, "loss": 1.9603, "num_input_tokens_seen": 281336728, "step": 276300 }, { "epoch": 5.441908999625918, "grad_norm": 2.902038812637329, "learning_rate": 7.089224175131263e-07, "loss": 2.0159, "num_input_tokens_seen": 281438664, "step": 276400 }, { "epoch": 5.4438778523754205, "grad_norm": 2.4794204235076904, "learning_rate": 7.087350809909883e-07, "loss": 1.9942, "num_input_tokens_seen": 281539616, "step": 276500 }, { "epoch": 5.445846705124923, "grad_norm": 1.9060282707214355, "learning_rate": 7.085477089752106e-07, "loss": 1.9664, "num_input_tokens_seen": 281641496, "step": 276600 }, { "epoch": 5.447815557874427, "grad_norm": 1.8969968557357788, "learning_rate": 7.083603014976536e-07, "loss": 2.0323, "num_input_tokens_seen": 281742592, "step": 276700 }, { "epoch": 5.44978441062393, "grad_norm": 1.7251744270324707, "learning_rate": 7.08172858590185e-07, "loss": 2.0215, "num_input_tokens_seen": 281844384, "step": 276800 }, { "epoch": 5.4517532633734325, "grad_norm": 1.742017149925232, "learning_rate": 7.079853802846773e-07, "loss": 1.9482, "num_input_tokens_seen": 281946136, "step": 276900 }, { "epoch": 5.453722116122935, "grad_norm": 2.905759811401367, "learning_rate": 7.077978666130103e-07, "loss": 1.9417, "num_input_tokens_seen": 282048536, "step": 277000 }, { "epoch": 5.455690968872438, "grad_norm": 1.920806884765625, "learning_rate": 7.076103176070685e-07, "loss": 1.9708, "num_input_tokens_seen": 282150936, "step": 277100 }, { "epoch": 5.457659821621941, "grad_norm": 1.9064234495162964, "learning_rate": 7.074227332987432e-07, "loss": 1.9656, "num_input_tokens_seen": 282252712, "step": 277200 }, { "epoch": 5.4596286743714435, "grad_norm": 1.929622769355774, "learning_rate": 7.072351137199317e-07, "loss": 2.004, "num_input_tokens_seen": 282355112, "step": 277300 }, { "epoch": 5.461597527120946, "grad_norm": 1.8693277835845947, "learning_rate": 7.070474589025371e-07, "loss": 1.9707, "num_input_tokens_seen": 282456920, "step": 277400 }, { "epoch": 5.46356637987045, "grad_norm": 1.8607865571975708, "learning_rate": 7.068597688784683e-07, "loss": 1.995, "num_input_tokens_seen": 282557656, "step": 277500 }, { "epoch": 5.465535232619953, "grad_norm": 2.2434775829315186, "learning_rate": 7.066720436796405e-07, "loss": 1.9817, "num_input_tokens_seen": 282660056, "step": 277600 }, { "epoch": 5.467504085369455, "grad_norm": 1.734452486038208, "learning_rate": 7.064842833379749e-07, "loss": 1.9861, "num_input_tokens_seen": 282760936, "step": 277700 }, { "epoch": 5.469472938118958, "grad_norm": 1.921830177307129, "learning_rate": 7.062964878853985e-07, "loss": 1.994, "num_input_tokens_seen": 282863336, "step": 277800 }, { "epoch": 5.471441790868461, "grad_norm": 1.9665969610214233, "learning_rate": 7.061086573538444e-07, "loss": 1.9454, "num_input_tokens_seen": 282964928, "step": 277900 }, { "epoch": 5.473410643617964, "grad_norm": 1.9603980779647827, "learning_rate": 7.059207917752512e-07, "loss": 1.9857, "num_input_tokens_seen": 283066736, "step": 278000 }, { "epoch": 5.4753794963674665, "grad_norm": 1.923351764678955, "learning_rate": 7.057328911815644e-07, "loss": 1.9627, "num_input_tokens_seen": 283167920, "step": 278100 }, { "epoch": 5.477348349116969, "grad_norm": 1.6758246421813965, "learning_rate": 7.055449556047347e-07, "loss": 2.0062, "num_input_tokens_seen": 283269752, "step": 278200 }, { "epoch": 5.479317201866472, "grad_norm": 2.052189826965332, "learning_rate": 7.053569850767189e-07, "loss": 1.9848, "num_input_tokens_seen": 283370360, "step": 278300 }, { "epoch": 5.481286054615976, "grad_norm": 2.209536552429199, "learning_rate": 7.051689796294799e-07, "loss": 2.0248, "num_input_tokens_seen": 283471992, "step": 278400 }, { "epoch": 5.483254907365478, "grad_norm": 1.984963297843933, "learning_rate": 7.049809392949863e-07, "loss": 1.9933, "num_input_tokens_seen": 283574392, "step": 278500 }, { "epoch": 5.485223760114981, "grad_norm": 1.7700587511062622, "learning_rate": 7.04792864105213e-07, "loss": 2.0123, "num_input_tokens_seen": 283676200, "step": 278600 }, { "epoch": 5.487192612864484, "grad_norm": 4.987680435180664, "learning_rate": 7.046047540921403e-07, "loss": 1.9983, "num_input_tokens_seen": 283777928, "step": 278700 }, { "epoch": 5.489161465613987, "grad_norm": 2.0759313106536865, "learning_rate": 7.044166092877551e-07, "loss": 1.9652, "num_input_tokens_seen": 283880328, "step": 278800 }, { "epoch": 5.491130318363489, "grad_norm": 1.8324307203292847, "learning_rate": 7.042284297240496e-07, "loss": 2.0007, "num_input_tokens_seen": 283980984, "step": 278900 }, { "epoch": 5.493099171112992, "grad_norm": 1.8795981407165527, "learning_rate": 7.040402154330218e-07, "loss": 2.0202, "num_input_tokens_seen": 284083384, "step": 279000 }, { "epoch": 5.495068023862495, "grad_norm": 1.7920551300048828, "learning_rate": 7.038519664466767e-07, "loss": 2.0162, "num_input_tokens_seen": 284185312, "step": 279100 }, { "epoch": 5.497036876611999, "grad_norm": 1.9927595853805542, "learning_rate": 7.036636827970238e-07, "loss": 1.9652, "num_input_tokens_seen": 284287072, "step": 279200 }, { "epoch": 5.499005729361501, "grad_norm": 1.9401782751083374, "learning_rate": 7.034753645160796e-07, "loss": 2.0085, "num_input_tokens_seen": 284389472, "step": 279300 }, { "epoch": 5.500974582111004, "grad_norm": 1.854786992073059, "learning_rate": 7.032870116358658e-07, "loss": 2.0226, "num_input_tokens_seen": 284491160, "step": 279400 }, { "epoch": 5.502943434860507, "grad_norm": 1.9346381425857544, "learning_rate": 7.030986241884104e-07, "loss": 1.996, "num_input_tokens_seen": 284593560, "step": 279500 }, { "epoch": 5.50491228761001, "grad_norm": 2.018704891204834, "learning_rate": 7.029102022057467e-07, "loss": 1.9927, "num_input_tokens_seen": 284695960, "step": 279600 }, { "epoch": 5.506881140359512, "grad_norm": 1.7590324878692627, "learning_rate": 7.027217457199149e-07, "loss": 2.0303, "num_input_tokens_seen": 284797496, "step": 279700 }, { "epoch": 5.508849993109015, "grad_norm": 1.8581695556640625, "learning_rate": 7.025332547629598e-07, "loss": 2.0225, "num_input_tokens_seen": 284898824, "step": 279800 }, { "epoch": 5.510818845858518, "grad_norm": 1.8246877193450928, "learning_rate": 7.023447293669332e-07, "loss": 1.9667, "num_input_tokens_seen": 285000416, "step": 279900 }, { "epoch": 5.512787698608021, "grad_norm": 1.817454218864441, "learning_rate": 7.021561695638917e-07, "loss": 1.9313, "num_input_tokens_seen": 285102816, "step": 280000 }, { "epoch": 5.514756551357524, "grad_norm": 2.2149713039398193, "learning_rate": 7.019675753858987e-07, "loss": 2.0217, "num_input_tokens_seen": 285204384, "step": 280100 }, { "epoch": 5.516725404107027, "grad_norm": 2.0778284072875977, "learning_rate": 7.01778946865023e-07, "loss": 1.9988, "num_input_tokens_seen": 285306048, "step": 280200 }, { "epoch": 5.51869425685653, "grad_norm": 1.8702492713928223, "learning_rate": 7.015902840333394e-07, "loss": 2.0129, "num_input_tokens_seen": 285408448, "step": 280300 }, { "epoch": 5.520663109606033, "grad_norm": 1.7011958360671997, "learning_rate": 7.014015869229283e-07, "loss": 1.9638, "num_input_tokens_seen": 285510352, "step": 280400 }, { "epoch": 5.522631962355535, "grad_norm": 2.153801441192627, "learning_rate": 7.01212855565876e-07, "loss": 2.0367, "num_input_tokens_seen": 285611592, "step": 280500 }, { "epoch": 5.524600815105038, "grad_norm": 1.9386348724365234, "learning_rate": 7.010240899942747e-07, "loss": 1.97, "num_input_tokens_seen": 285713376, "step": 280600 }, { "epoch": 5.526569667854541, "grad_norm": 1.938071846961975, "learning_rate": 7.008352902402224e-07, "loss": 2.0261, "num_input_tokens_seen": 285815128, "step": 280700 }, { "epoch": 5.5285385206040445, "grad_norm": 1.8405362367630005, "learning_rate": 7.00646456335823e-07, "loss": 1.9813, "num_input_tokens_seen": 285916920, "step": 280800 }, { "epoch": 5.530507373353547, "grad_norm": 1.87881338596344, "learning_rate": 7.004575883131859e-07, "loss": 1.9776, "num_input_tokens_seen": 286019320, "step": 280900 }, { "epoch": 5.53247622610305, "grad_norm": 2.1490933895111084, "learning_rate": 7.002686862044266e-07, "loss": 1.9733, "num_input_tokens_seen": 286121720, "step": 281000 }, { "epoch": 5.534445078852553, "grad_norm": 2.014657974243164, "learning_rate": 7.000797500416665e-07, "loss": 1.9733, "num_input_tokens_seen": 286224120, "step": 281100 }, { "epoch": 5.5364139316020555, "grad_norm": 2.160707712173462, "learning_rate": 6.998907798570325e-07, "loss": 1.998, "num_input_tokens_seen": 286325336, "step": 281200 }, { "epoch": 5.538382784351558, "grad_norm": 2.2819178104400635, "learning_rate": 6.997017756826572e-07, "loss": 1.9691, "num_input_tokens_seen": 286427256, "step": 281300 }, { "epoch": 5.540351637101061, "grad_norm": 2.0642409324645996, "learning_rate": 6.995127375506794e-07, "loss": 1.9654, "num_input_tokens_seen": 286529656, "step": 281400 }, { "epoch": 5.542320489850564, "grad_norm": 2.2802693843841553, "learning_rate": 6.993236654932433e-07, "loss": 1.9904, "num_input_tokens_seen": 286631064, "step": 281500 }, { "epoch": 5.544289342600067, "grad_norm": 1.6591362953186035, "learning_rate": 6.991345595424991e-07, "loss": 1.9814, "num_input_tokens_seen": 286733464, "step": 281600 }, { "epoch": 5.546258195349569, "grad_norm": 1.9292830228805542, "learning_rate": 6.989454197306027e-07, "loss": 2.0289, "num_input_tokens_seen": 286835120, "step": 281700 }, { "epoch": 5.548227048099073, "grad_norm": 2.751769542694092, "learning_rate": 6.987562460897155e-07, "loss": 1.979, "num_input_tokens_seen": 286937520, "step": 281800 }, { "epoch": 5.550195900848576, "grad_norm": 2.3896677494049072, "learning_rate": 6.98567038652005e-07, "loss": 1.9442, "num_input_tokens_seen": 287039920, "step": 281900 }, { "epoch": 5.5521647535980785, "grad_norm": 1.998288869857788, "learning_rate": 6.983777974496446e-07, "loss": 1.9655, "num_input_tokens_seen": 287142320, "step": 282000 }, { "epoch": 5.554133606347581, "grad_norm": 1.8365534543991089, "learning_rate": 6.981885225148127e-07, "loss": 1.9509, "num_input_tokens_seen": 287244720, "step": 282100 }, { "epoch": 5.556102459097084, "grad_norm": 1.9905551671981812, "learning_rate": 6.979992138796941e-07, "loss": 2.0188, "num_input_tokens_seen": 287346560, "step": 282200 }, { "epoch": 5.558071311846587, "grad_norm": 2.0359601974487305, "learning_rate": 6.978098715764794e-07, "loss": 2.0305, "num_input_tokens_seen": 287447432, "step": 282300 }, { "epoch": 5.5600401645960895, "grad_norm": 2.004528522491455, "learning_rate": 6.976204956373645e-07, "loss": 1.9655, "num_input_tokens_seen": 287549832, "step": 282400 }, { "epoch": 5.562009017345593, "grad_norm": 1.9784183502197266, "learning_rate": 6.974310860945509e-07, "loss": 1.9744, "num_input_tokens_seen": 287651728, "step": 282500 }, { "epoch": 5.563977870095096, "grad_norm": 1.8037654161453247, "learning_rate": 6.972416429802465e-07, "loss": 1.9985, "num_input_tokens_seen": 287754128, "step": 282600 }, { "epoch": 5.565946722844599, "grad_norm": 1.8915928602218628, "learning_rate": 6.970521663266642e-07, "loss": 1.9789, "num_input_tokens_seen": 287855848, "step": 282700 }, { "epoch": 5.567915575594101, "grad_norm": 1.8359853029251099, "learning_rate": 6.968626561660231e-07, "loss": 1.9898, "num_input_tokens_seen": 287958248, "step": 282800 }, { "epoch": 5.569884428343604, "grad_norm": 1.9419841766357422, "learning_rate": 6.966731125305476e-07, "loss": 1.9427, "num_input_tokens_seen": 288060648, "step": 282900 }, { "epoch": 5.571853281093107, "grad_norm": 4.6133294105529785, "learning_rate": 6.964835354524684e-07, "loss": 2.001, "num_input_tokens_seen": 288161920, "step": 283000 }, { "epoch": 5.57382213384261, "grad_norm": 1.8769828081130981, "learning_rate": 6.962939249640209e-07, "loss": 1.9891, "num_input_tokens_seen": 288264320, "step": 283100 }, { "epoch": 5.5757909865921125, "grad_norm": 2.1598141193389893, "learning_rate": 6.961042810974474e-07, "loss": 1.9779, "num_input_tokens_seen": 288365376, "step": 283200 }, { "epoch": 5.577759839341615, "grad_norm": 1.9685794115066528, "learning_rate": 6.959146038849945e-07, "loss": 1.9889, "num_input_tokens_seen": 288467776, "step": 283300 }, { "epoch": 5.579728692091119, "grad_norm": 5.988781929016113, "learning_rate": 6.957248933589161e-07, "loss": 2.011, "num_input_tokens_seen": 288569528, "step": 283400 }, { "epoch": 5.581697544840622, "grad_norm": 1.9712460041046143, "learning_rate": 6.955351495514701e-07, "loss": 1.9834, "num_input_tokens_seen": 288671928, "step": 283500 }, { "epoch": 5.583666397590124, "grad_norm": 1.7132648229599, "learning_rate": 6.953453724949212e-07, "loss": 2.0101, "num_input_tokens_seen": 288774328, "step": 283600 }, { "epoch": 5.585635250339627, "grad_norm": 2.162921905517578, "learning_rate": 6.951555622215393e-07, "loss": 2.0175, "num_input_tokens_seen": 288875096, "step": 283700 }, { "epoch": 5.58760410308913, "grad_norm": 2.0548007488250732, "learning_rate": 6.949657187636003e-07, "loss": 2.0156, "num_input_tokens_seen": 288977016, "step": 283800 }, { "epoch": 5.589572955838633, "grad_norm": 2.019831418991089, "learning_rate": 6.947758421533849e-07, "loss": 1.9681, "num_input_tokens_seen": 289078216, "step": 283900 }, { "epoch": 5.591541808588135, "grad_norm": 2.0104689598083496, "learning_rate": 6.945859324231806e-07, "loss": 1.9827, "num_input_tokens_seen": 289180616, "step": 284000 }, { "epoch": 5.593510661337638, "grad_norm": 2.0804810523986816, "learning_rate": 6.943959896052796e-07, "loss": 2.0707, "num_input_tokens_seen": 289280832, "step": 284100 }, { "epoch": 5.595479514087142, "grad_norm": 1.8354921340942383, "learning_rate": 6.942060137319802e-07, "loss": 2.032, "num_input_tokens_seen": 289383232, "step": 284200 }, { "epoch": 5.597448366836645, "grad_norm": 2.3384897708892822, "learning_rate": 6.940160048355861e-07, "loss": 1.9833, "num_input_tokens_seen": 289485632, "step": 284300 }, { "epoch": 5.599417219586147, "grad_norm": 1.866071343421936, "learning_rate": 6.938259629484069e-07, "loss": 2.0212, "num_input_tokens_seen": 289587584, "step": 284400 }, { "epoch": 5.60138607233565, "grad_norm": 1.974873661994934, "learning_rate": 6.936358881027575e-07, "loss": 1.9549, "num_input_tokens_seen": 289688536, "step": 284500 }, { "epoch": 5.603354925085153, "grad_norm": 2.2453863620758057, "learning_rate": 6.934457803309585e-07, "loss": 1.9611, "num_input_tokens_seen": 289790936, "step": 284600 }, { "epoch": 5.605323777834656, "grad_norm": 1.9215408563613892, "learning_rate": 6.93255639665336e-07, "loss": 1.9793, "num_input_tokens_seen": 289893336, "step": 284700 }, { "epoch": 5.607292630584158, "grad_norm": 2.227081537246704, "learning_rate": 6.930654661382223e-07, "loss": 2.0022, "num_input_tokens_seen": 289994992, "step": 284800 }, { "epoch": 5.609261483333661, "grad_norm": 2.0311901569366455, "learning_rate": 6.928752597819541e-07, "loss": 2.0112, "num_input_tokens_seen": 290096792, "step": 284900 }, { "epoch": 5.611230336083164, "grad_norm": 1.6489031314849854, "learning_rate": 6.92685020628875e-07, "loss": 1.9136, "num_input_tokens_seen": 290199192, "step": 285000 }, { "epoch": 5.6131991888326676, "grad_norm": 2.2252256870269775, "learning_rate": 6.924947487113333e-07, "loss": 1.9666, "num_input_tokens_seen": 290300824, "step": 285100 }, { "epoch": 5.61516804158217, "grad_norm": 1.861838459968567, "learning_rate": 6.923044440616831e-07, "loss": 1.974, "num_input_tokens_seen": 290402608, "step": 285200 }, { "epoch": 5.617136894331673, "grad_norm": 2.2326159477233887, "learning_rate": 6.921141067122842e-07, "loss": 2.0539, "num_input_tokens_seen": 290504248, "step": 285300 }, { "epoch": 5.619105747081176, "grad_norm": 1.9460946321487427, "learning_rate": 6.919237366955017e-07, "loss": 2.0213, "num_input_tokens_seen": 290606024, "step": 285400 }, { "epoch": 5.621074599830679, "grad_norm": 1.9875304698944092, "learning_rate": 6.917333340437067e-07, "loss": 1.9549, "num_input_tokens_seen": 290707120, "step": 285500 }, { "epoch": 5.623043452580181, "grad_norm": 2.336749315261841, "learning_rate": 6.915428987892753e-07, "loss": 2.0025, "num_input_tokens_seen": 290809520, "step": 285600 }, { "epoch": 5.625012305329684, "grad_norm": 2.5193111896514893, "learning_rate": 6.913524309645897e-07, "loss": 1.996, "num_input_tokens_seen": 290910528, "step": 285700 }, { "epoch": 5.626981158079188, "grad_norm": 1.9238159656524658, "learning_rate": 6.911619306020368e-07, "loss": 1.9955, "num_input_tokens_seen": 291012456, "step": 285800 }, { "epoch": 5.6289500108286905, "grad_norm": 1.997316837310791, "learning_rate": 6.909713977340102e-07, "loss": 1.9883, "num_input_tokens_seen": 291113584, "step": 285900 }, { "epoch": 5.630918863578193, "grad_norm": 2.033946990966797, "learning_rate": 6.90780832392908e-07, "loss": 2.0086, "num_input_tokens_seen": 291215224, "step": 286000 }, { "epoch": 5.632887716327696, "grad_norm": 1.861080288887024, "learning_rate": 6.905902346111346e-07, "loss": 2.0076, "num_input_tokens_seen": 291315200, "step": 286100 }, { "epoch": 5.634856569077199, "grad_norm": 2.0090553760528564, "learning_rate": 6.90399604421099e-07, "loss": 1.9033, "num_input_tokens_seen": 291417600, "step": 286200 }, { "epoch": 5.6368254218267015, "grad_norm": 1.9354575872421265, "learning_rate": 6.902089418552169e-07, "loss": 1.9776, "num_input_tokens_seen": 291519184, "step": 286300 }, { "epoch": 5.638794274576204, "grad_norm": 1.853554368019104, "learning_rate": 6.900182469459082e-07, "loss": 1.9789, "num_input_tokens_seen": 291621584, "step": 286400 }, { "epoch": 5.640763127325707, "grad_norm": 1.9268100261688232, "learning_rate": 6.898275197255996e-07, "loss": 1.9895, "num_input_tokens_seen": 291723360, "step": 286500 }, { "epoch": 5.64273198007521, "grad_norm": 8.665900230407715, "learning_rate": 6.896367602267221e-07, "loss": 2.0115, "num_input_tokens_seen": 291825400, "step": 286600 }, { "epoch": 5.644700832824713, "grad_norm": 2.1298089027404785, "learning_rate": 6.894459684817133e-07, "loss": 1.9229, "num_input_tokens_seen": 291927800, "step": 286700 }, { "epoch": 5.646669685574216, "grad_norm": 1.7074607610702515, "learning_rate": 6.89255144523015e-07, "loss": 1.9744, "num_input_tokens_seen": 292030200, "step": 286800 }, { "epoch": 5.648638538323719, "grad_norm": 1.8821911811828613, "learning_rate": 6.89064288383076e-07, "loss": 1.9755, "num_input_tokens_seen": 292131136, "step": 286900 }, { "epoch": 5.650607391073222, "grad_norm": 1.807858943939209, "learning_rate": 6.888734000943493e-07, "loss": 2.0074, "num_input_tokens_seen": 292232536, "step": 287000 }, { "epoch": 5.6525762438227245, "grad_norm": 2.4728779792785645, "learning_rate": 6.886824796892939e-07, "loss": 1.9892, "num_input_tokens_seen": 292334296, "step": 287100 }, { "epoch": 5.654545096572227, "grad_norm": 1.7749592065811157, "learning_rate": 6.88491527200374e-07, "loss": 1.967, "num_input_tokens_seen": 292436696, "step": 287200 }, { "epoch": 5.65651394932173, "grad_norm": 2.0146358013153076, "learning_rate": 6.883005426600601e-07, "loss": 1.9608, "num_input_tokens_seen": 292538504, "step": 287300 }, { "epoch": 5.658482802071233, "grad_norm": 2.08988356590271, "learning_rate": 6.88109526100827e-07, "loss": 1.9634, "num_input_tokens_seen": 292640904, "step": 287400 }, { "epoch": 5.660451654820736, "grad_norm": 1.7986937761306763, "learning_rate": 6.879184775551556e-07, "loss": 1.9478, "num_input_tokens_seen": 292743304, "step": 287500 }, { "epoch": 5.662420507570239, "grad_norm": 1.8585352897644043, "learning_rate": 6.87727397055532e-07, "loss": 1.988, "num_input_tokens_seen": 292844304, "step": 287600 }, { "epoch": 5.664389360319742, "grad_norm": 1.8912304639816284, "learning_rate": 6.875362846344479e-07, "loss": 1.986, "num_input_tokens_seen": 292945192, "step": 287700 }, { "epoch": 5.666358213069245, "grad_norm": 1.752672791481018, "learning_rate": 6.873451403244004e-07, "loss": 1.9173, "num_input_tokens_seen": 293047176, "step": 287800 }, { "epoch": 5.6683270658187475, "grad_norm": 2.038039207458496, "learning_rate": 6.871539641578919e-07, "loss": 2.0048, "num_input_tokens_seen": 293148896, "step": 287900 }, { "epoch": 5.67029591856825, "grad_norm": 1.8853589296340942, "learning_rate": 6.869627561674303e-07, "loss": 1.9975, "num_input_tokens_seen": 293251296, "step": 288000 }, { "epoch": 5.672264771317753, "grad_norm": 2.0846149921417236, "learning_rate": 6.86771516385529e-07, "loss": 1.9658, "num_input_tokens_seen": 293353696, "step": 288100 }, { "epoch": 5.674233624067256, "grad_norm": 1.9113783836364746, "learning_rate": 6.865802448447064e-07, "loss": 1.9763, "num_input_tokens_seen": 293454416, "step": 288200 }, { "epoch": 5.6762024768167585, "grad_norm": 1.9036356210708618, "learning_rate": 6.863889415774872e-07, "loss": 2.0191, "num_input_tokens_seen": 293555320, "step": 288300 }, { "epoch": 5.678171329566261, "grad_norm": 2.454892635345459, "learning_rate": 6.861976066164004e-07, "loss": 1.9844, "num_input_tokens_seen": 293657720, "step": 288400 }, { "epoch": 5.680140182315765, "grad_norm": 1.8267836570739746, "learning_rate": 6.86006239993981e-07, "loss": 2.0506, "num_input_tokens_seen": 293760120, "step": 288500 }, { "epoch": 5.682109035065268, "grad_norm": 2.0792393684387207, "learning_rate": 6.858148417427693e-07, "loss": 1.961, "num_input_tokens_seen": 293862520, "step": 288600 }, { "epoch": 5.68407788781477, "grad_norm": 1.9794563055038452, "learning_rate": 6.856234118953112e-07, "loss": 1.9825, "num_input_tokens_seen": 293964288, "step": 288700 }, { "epoch": 5.686046740564273, "grad_norm": 2.4365408420562744, "learning_rate": 6.854319504841575e-07, "loss": 1.9683, "num_input_tokens_seen": 294066688, "step": 288800 }, { "epoch": 5.688015593313776, "grad_norm": 1.8940140008926392, "learning_rate": 6.852404575418647e-07, "loss": 2.0323, "num_input_tokens_seen": 294168744, "step": 288900 }, { "epoch": 5.689984446063279, "grad_norm": 1.882495403289795, "learning_rate": 6.850489331009942e-07, "loss": 2.0024, "num_input_tokens_seen": 294269424, "step": 289000 }, { "epoch": 5.6919532988127814, "grad_norm": 2.120837688446045, "learning_rate": 6.848573771941137e-07, "loss": 1.9956, "num_input_tokens_seen": 294371160, "step": 289100 }, { "epoch": 5.693922151562285, "grad_norm": 2.1796810626983643, "learning_rate": 6.846657898537951e-07, "loss": 2.0071, "num_input_tokens_seen": 294473560, "step": 289200 }, { "epoch": 5.695891004311788, "grad_norm": 1.8778839111328125, "learning_rate": 6.844741711126165e-07, "loss": 1.9922, "num_input_tokens_seen": 294575816, "step": 289300 }, { "epoch": 5.697859857061291, "grad_norm": 2.173163414001465, "learning_rate": 6.842825210031612e-07, "loss": 1.9967, "num_input_tokens_seen": 294678216, "step": 289400 }, { "epoch": 5.699828709810793, "grad_norm": 2.1021974086761475, "learning_rate": 6.840908395580174e-07, "loss": 1.9757, "num_input_tokens_seen": 294780616, "step": 289500 }, { "epoch": 5.701797562560296, "grad_norm": 1.8285257816314697, "learning_rate": 6.838991268097791e-07, "loss": 1.9713, "num_input_tokens_seen": 294883016, "step": 289600 }, { "epoch": 5.703766415309799, "grad_norm": 1.738957166671753, "learning_rate": 6.837073827910453e-07, "loss": 1.9793, "num_input_tokens_seen": 294984552, "step": 289700 }, { "epoch": 5.705735268059302, "grad_norm": 1.7961680889129639, "learning_rate": 6.835156075344206e-07, "loss": 1.9934, "num_input_tokens_seen": 295085672, "step": 289800 }, { "epoch": 5.707704120808804, "grad_norm": 1.8706741333007812, "learning_rate": 6.833238010725146e-07, "loss": 1.9639, "num_input_tokens_seen": 295188072, "step": 289900 }, { "epoch": 5.709672973558307, "grad_norm": 1.7334696054458618, "learning_rate": 6.831319634379426e-07, "loss": 1.9379, "num_input_tokens_seen": 295290472, "step": 290000 }, { "epoch": 5.711641826307811, "grad_norm": 1.8002521991729736, "learning_rate": 6.829400946633245e-07, "loss": 1.9655, "num_input_tokens_seen": 295392408, "step": 290100 }, { "epoch": 5.713610679057314, "grad_norm": 1.7615959644317627, "learning_rate": 6.827481947812865e-07, "loss": 1.9791, "num_input_tokens_seen": 295493256, "step": 290200 }, { "epoch": 5.715579531806816, "grad_norm": 2.7020058631896973, "learning_rate": 6.825562638244593e-07, "loss": 1.9498, "num_input_tokens_seen": 295594432, "step": 290300 }, { "epoch": 5.717548384556319, "grad_norm": 1.7554099559783936, "learning_rate": 6.823643018254794e-07, "loss": 1.9931, "num_input_tokens_seen": 295696832, "step": 290400 }, { "epoch": 5.719517237305822, "grad_norm": 1.9533312320709229, "learning_rate": 6.821723088169879e-07, "loss": 1.9763, "num_input_tokens_seen": 295798408, "step": 290500 }, { "epoch": 5.721486090055325, "grad_norm": 2.0266520977020264, "learning_rate": 6.819802848316318e-07, "loss": 1.9509, "num_input_tokens_seen": 295900808, "step": 290600 }, { "epoch": 5.723454942804827, "grad_norm": 2.1518337726593018, "learning_rate": 6.817882299020631e-07, "loss": 1.9939, "num_input_tokens_seen": 296002648, "step": 290700 }, { "epoch": 5.72542379555433, "grad_norm": 1.8498213291168213, "learning_rate": 6.815961440609393e-07, "loss": 2.0672, "num_input_tokens_seen": 296103424, "step": 290800 }, { "epoch": 5.727392648303834, "grad_norm": 2.0848705768585205, "learning_rate": 6.814040273409229e-07, "loss": 1.978, "num_input_tokens_seen": 296204248, "step": 290900 }, { "epoch": 5.7293615010533365, "grad_norm": 1.7975051403045654, "learning_rate": 6.812118797746817e-07, "loss": 2.104, "num_input_tokens_seen": 296305504, "step": 291000 }, { "epoch": 5.731330353802839, "grad_norm": 1.978773593902588, "learning_rate": 6.810197013948885e-07, "loss": 1.9739, "num_input_tokens_seen": 296407904, "step": 291100 }, { "epoch": 5.733299206552342, "grad_norm": 1.8710181713104248, "learning_rate": 6.808274922342221e-07, "loss": 1.9535, "num_input_tokens_seen": 296507648, "step": 291200 }, { "epoch": 5.735268059301845, "grad_norm": 1.9824811220169067, "learning_rate": 6.806352523253655e-07, "loss": 1.9836, "num_input_tokens_seen": 296609736, "step": 291300 }, { "epoch": 5.737236912051348, "grad_norm": 1.8356417417526245, "learning_rate": 6.804429817010082e-07, "loss": 2.0004, "num_input_tokens_seen": 296710632, "step": 291400 }, { "epoch": 5.73920576480085, "grad_norm": 2.2513487339019775, "learning_rate": 6.802506803938436e-07, "loss": 2.0035, "num_input_tokens_seen": 296811680, "step": 291500 }, { "epoch": 5.741174617550353, "grad_norm": 2.213818073272705, "learning_rate": 6.800583484365712e-07, "loss": 1.9697, "num_input_tokens_seen": 296912344, "step": 291600 }, { "epoch": 5.743143470299856, "grad_norm": 1.8014044761657715, "learning_rate": 6.798659858618951e-07, "loss": 1.9509, "num_input_tokens_seen": 297013824, "step": 291700 }, { "epoch": 5.7451123230493595, "grad_norm": 1.8372653722763062, "learning_rate": 6.796735927025255e-07, "loss": 2.0054, "num_input_tokens_seen": 297116224, "step": 291800 }, { "epoch": 5.747081175798862, "grad_norm": 1.8215463161468506, "learning_rate": 6.794811689911766e-07, "loss": 2.0063, "num_input_tokens_seen": 297217696, "step": 291900 }, { "epoch": 5.749050028548365, "grad_norm": 1.8149486780166626, "learning_rate": 6.792887147605689e-07, "loss": 1.9432, "num_input_tokens_seen": 297319328, "step": 292000 }, { "epoch": 5.751018881297868, "grad_norm": 1.895106315612793, "learning_rate": 6.790962300434273e-07, "loss": 1.983, "num_input_tokens_seen": 297421288, "step": 292100 }, { "epoch": 5.7529877340473705, "grad_norm": 1.8608224391937256, "learning_rate": 6.789037148724826e-07, "loss": 1.9708, "num_input_tokens_seen": 297523688, "step": 292200 }, { "epoch": 5.754956586796873, "grad_norm": 2.3314852714538574, "learning_rate": 6.787111692804698e-07, "loss": 1.9471, "num_input_tokens_seen": 297624984, "step": 292300 }, { "epoch": 5.756925439546376, "grad_norm": 2.004523277282715, "learning_rate": 6.785185933001302e-07, "loss": 2.043, "num_input_tokens_seen": 297727384, "step": 292400 }, { "epoch": 5.75889429229588, "grad_norm": 1.8335539102554321, "learning_rate": 6.783259869642094e-07, "loss": 1.924, "num_input_tokens_seen": 297829080, "step": 292500 }, { "epoch": 5.760863145045382, "grad_norm": 1.7369937896728516, "learning_rate": 6.781333503054587e-07, "loss": 1.9722, "num_input_tokens_seen": 297931480, "step": 292600 }, { "epoch": 5.762831997794885, "grad_norm": 1.826288104057312, "learning_rate": 6.779406833566341e-07, "loss": 2.0215, "num_input_tokens_seen": 298033880, "step": 292700 }, { "epoch": 5.764800850544388, "grad_norm": 1.8336760997772217, "learning_rate": 6.777479861504973e-07, "loss": 1.9846, "num_input_tokens_seen": 298135456, "step": 292800 }, { "epoch": 5.766769703293891, "grad_norm": 2.0768070220947266, "learning_rate": 6.775552587198144e-07, "loss": 1.981, "num_input_tokens_seen": 298237856, "step": 292900 }, { "epoch": 5.7687385560433935, "grad_norm": 2.022739887237549, "learning_rate": 6.773625010973576e-07, "loss": 2.046, "num_input_tokens_seen": 298338064, "step": 293000 }, { "epoch": 5.770707408792896, "grad_norm": 1.801545262336731, "learning_rate": 6.771697133159033e-07, "loss": 2.0324, "num_input_tokens_seen": 298438440, "step": 293100 }, { "epoch": 5.772676261542399, "grad_norm": 2.091960906982422, "learning_rate": 6.769768954082335e-07, "loss": 1.9919, "num_input_tokens_seen": 298540840, "step": 293200 }, { "epoch": 5.774645114291902, "grad_norm": 2.9989755153656006, "learning_rate": 6.767840474071354e-07, "loss": 2.0144, "num_input_tokens_seen": 298643168, "step": 293300 }, { "epoch": 5.7766139670414045, "grad_norm": 1.9307326078414917, "learning_rate": 6.76591169345401e-07, "loss": 1.9693, "num_input_tokens_seen": 298745568, "step": 293400 }, { "epoch": 5.778582819790908, "grad_norm": 1.7557299137115479, "learning_rate": 6.763982612558279e-07, "loss": 1.9826, "num_input_tokens_seen": 298847968, "step": 293500 }, { "epoch": 5.780551672540411, "grad_norm": 2.4569568634033203, "learning_rate": 6.762053231712181e-07, "loss": 2.0128, "num_input_tokens_seen": 298948504, "step": 293600 }, { "epoch": 5.782520525289914, "grad_norm": 2.366541624069214, "learning_rate": 6.760123551243795e-07, "loss": 2.0273, "num_input_tokens_seen": 299049216, "step": 293700 }, { "epoch": 5.784489378039416, "grad_norm": 2.058173179626465, "learning_rate": 6.758193571481242e-07, "loss": 1.9591, "num_input_tokens_seen": 299151016, "step": 293800 }, { "epoch": 5.786458230788919, "grad_norm": 1.9351673126220703, "learning_rate": 6.756263292752703e-07, "loss": 2.005, "num_input_tokens_seen": 299252600, "step": 293900 }, { "epoch": 5.788427083538422, "grad_norm": 2.0175981521606445, "learning_rate": 6.754332715386403e-07, "loss": 1.9876, "num_input_tokens_seen": 299354504, "step": 294000 }, { "epoch": 5.790395936287925, "grad_norm": 1.9232161045074463, "learning_rate": 6.752401839710623e-07, "loss": 1.9929, "num_input_tokens_seen": 299456624, "step": 294100 }, { "epoch": 5.792364789037428, "grad_norm": 2.2448158264160156, "learning_rate": 6.750470666053689e-07, "loss": 1.9901, "num_input_tokens_seen": 299559024, "step": 294200 }, { "epoch": 5.794333641786931, "grad_norm": 1.9506829977035522, "learning_rate": 6.748539194743983e-07, "loss": 1.9729, "num_input_tokens_seen": 299660120, "step": 294300 }, { "epoch": 5.796302494536434, "grad_norm": 4.236745357513428, "learning_rate": 6.746607426109935e-07, "loss": 2.0085, "num_input_tokens_seen": 299761664, "step": 294400 }, { "epoch": 5.798271347285937, "grad_norm": 2.086859941482544, "learning_rate": 6.744675360480026e-07, "loss": 1.9446, "num_input_tokens_seen": 299864064, "step": 294500 }, { "epoch": 5.800240200035439, "grad_norm": 3.206986427307129, "learning_rate": 6.742742998182786e-07, "loss": 1.9954, "num_input_tokens_seen": 299966464, "step": 294600 }, { "epoch": 5.802209052784942, "grad_norm": 2.541658401489258, "learning_rate": 6.740810339546801e-07, "loss": 1.9703, "num_input_tokens_seen": 300068864, "step": 294700 }, { "epoch": 5.804177905534445, "grad_norm": 1.910529375076294, "learning_rate": 6.738877384900698e-07, "loss": 1.9419, "num_input_tokens_seen": 300171264, "step": 294800 }, { "epoch": 5.806146758283948, "grad_norm": 2.054974317550659, "learning_rate": 6.736944134573162e-07, "loss": 1.9759, "num_input_tokens_seen": 300273664, "step": 294900 }, { "epoch": 5.80811561103345, "grad_norm": 1.8165878057479858, "learning_rate": 6.735010588892926e-07, "loss": 2.006, "num_input_tokens_seen": 300375192, "step": 295000 }, { "epoch": 5.810084463782953, "grad_norm": 1.8727779388427734, "learning_rate": 6.733076748188775e-07, "loss": 1.9794, "num_input_tokens_seen": 300475400, "step": 295100 }, { "epoch": 5.812053316532457, "grad_norm": 1.6799296140670776, "learning_rate": 6.731142612789539e-07, "loss": 1.9873, "num_input_tokens_seen": 300577360, "step": 295200 }, { "epoch": 5.81402216928196, "grad_norm": 2.0564098358154297, "learning_rate": 6.729208183024102e-07, "loss": 2.0005, "num_input_tokens_seen": 300679760, "step": 295300 }, { "epoch": 5.815991022031462, "grad_norm": 1.930026650428772, "learning_rate": 6.727273459221399e-07, "loss": 1.9458, "num_input_tokens_seen": 300782160, "step": 295400 }, { "epoch": 5.817959874780965, "grad_norm": 2.0773508548736572, "learning_rate": 6.725338441710413e-07, "loss": 1.9488, "num_input_tokens_seen": 300884560, "step": 295500 }, { "epoch": 5.819928727530468, "grad_norm": 2.026898145675659, "learning_rate": 6.723403130820177e-07, "loss": 1.9835, "num_input_tokens_seen": 300986424, "step": 295600 }, { "epoch": 5.821897580279971, "grad_norm": 1.7822896242141724, "learning_rate": 6.721467526879775e-07, "loss": 1.9954, "num_input_tokens_seen": 301088824, "step": 295700 }, { "epoch": 5.823866433029473, "grad_norm": 2.170503616333008, "learning_rate": 6.719531630218339e-07, "loss": 1.9929, "num_input_tokens_seen": 301191224, "step": 295800 }, { "epoch": 5.825835285778977, "grad_norm": 1.8892829418182373, "learning_rate": 6.717595441165053e-07, "loss": 1.9756, "num_input_tokens_seen": 301293624, "step": 295900 }, { "epoch": 5.82780413852848, "grad_norm": 1.8213739395141602, "learning_rate": 6.715658960049148e-07, "loss": 1.9704, "num_input_tokens_seen": 301395480, "step": 296000 }, { "epoch": 5.8297729912779825, "grad_norm": 3.8382740020751953, "learning_rate": 6.71372218719991e-07, "loss": 1.9791, "num_input_tokens_seen": 301497128, "step": 296100 }, { "epoch": 5.831741844027485, "grad_norm": 1.7518540620803833, "learning_rate": 6.711785122946665e-07, "loss": 2.0157, "num_input_tokens_seen": 301598712, "step": 296200 }, { "epoch": 5.833710696776988, "grad_norm": 3.1765055656433105, "learning_rate": 6.709847767618799e-07, "loss": 2.0557, "num_input_tokens_seen": 301701112, "step": 296300 }, { "epoch": 5.835679549526491, "grad_norm": 1.9587994813919067, "learning_rate": 6.707910121545741e-07, "loss": 1.9887, "num_input_tokens_seen": 301801120, "step": 296400 }, { "epoch": 5.837648402275994, "grad_norm": 2.064427137374878, "learning_rate": 6.705972185056974e-07, "loss": 1.973, "num_input_tokens_seen": 301902472, "step": 296500 }, { "epoch": 5.839617255025496, "grad_norm": 2.107823371887207, "learning_rate": 6.704033958482023e-07, "loss": 2.0214, "num_input_tokens_seen": 302002760, "step": 296600 }, { "epoch": 5.841586107774999, "grad_norm": 1.9233169555664062, "learning_rate": 6.702095442150473e-07, "loss": 2.0106, "num_input_tokens_seen": 302105160, "step": 296700 }, { "epoch": 5.843554960524503, "grad_norm": 1.7996037006378174, "learning_rate": 6.700156636391946e-07, "loss": 2.0012, "num_input_tokens_seen": 302206168, "step": 296800 }, { "epoch": 5.8455238132740055, "grad_norm": 1.813899278640747, "learning_rate": 6.698217541536126e-07, "loss": 1.9686, "num_input_tokens_seen": 302307472, "step": 296900 }, { "epoch": 5.847492666023508, "grad_norm": 1.7590276002883911, "learning_rate": 6.696278157912734e-07, "loss": 1.9898, "num_input_tokens_seen": 302409872, "step": 297000 }, { "epoch": 5.849461518773011, "grad_norm": 1.7217687368392944, "learning_rate": 6.69433848585155e-07, "loss": 1.9624, "num_input_tokens_seen": 302512272, "step": 297100 }, { "epoch": 5.851430371522514, "grad_norm": 1.771230697631836, "learning_rate": 6.692398525682396e-07, "loss": 1.9856, "num_input_tokens_seen": 302613304, "step": 297200 }, { "epoch": 5.8533992242720165, "grad_norm": 2.3142309188842773, "learning_rate": 6.690458277735147e-07, "loss": 1.9771, "num_input_tokens_seen": 302715704, "step": 297300 }, { "epoch": 5.855368077021519, "grad_norm": 2.4750351905822754, "learning_rate": 6.688517742339726e-07, "loss": 1.9781, "num_input_tokens_seen": 302817512, "step": 297400 }, { "epoch": 5.857336929771022, "grad_norm": 2.156783103942871, "learning_rate": 6.686576919826104e-07, "loss": 1.9988, "num_input_tokens_seen": 302918136, "step": 297500 }, { "epoch": 5.859305782520526, "grad_norm": 1.8108576536178589, "learning_rate": 6.684635810524303e-07, "loss": 1.9895, "num_input_tokens_seen": 303019304, "step": 297600 }, { "epoch": 5.8612746352700285, "grad_norm": 1.674936056137085, "learning_rate": 6.682694414764391e-07, "loss": 1.986, "num_input_tokens_seen": 303121704, "step": 297700 }, { "epoch": 5.863243488019531, "grad_norm": 1.9008970260620117, "learning_rate": 6.680752732876487e-07, "loss": 1.9639, "num_input_tokens_seen": 303221752, "step": 297800 }, { "epoch": 5.865212340769034, "grad_norm": 1.8959661722183228, "learning_rate": 6.678810765190756e-07, "loss": 2.0142, "num_input_tokens_seen": 303323392, "step": 297900 }, { "epoch": 5.867181193518537, "grad_norm": 1.9603074789047241, "learning_rate": 6.676868512037414e-07, "loss": 1.9939, "num_input_tokens_seen": 303425792, "step": 298000 }, { "epoch": 5.8691500462680395, "grad_norm": 1.7947169542312622, "learning_rate": 6.674925973746725e-07, "loss": 1.966, "num_input_tokens_seen": 303527608, "step": 298100 }, { "epoch": 5.871118899017542, "grad_norm": 2.029937744140625, "learning_rate": 6.672983150649003e-07, "loss": 2.004, "num_input_tokens_seen": 303629160, "step": 298200 }, { "epoch": 5.873087751767045, "grad_norm": 2.1655921936035156, "learning_rate": 6.671040043074605e-07, "loss": 1.9543, "num_input_tokens_seen": 303731464, "step": 298300 }, { "epoch": 5.875056604516548, "grad_norm": 1.9153037071228027, "learning_rate": 6.669096651353941e-07, "loss": 1.9276, "num_input_tokens_seen": 303833104, "step": 298400 }, { "epoch": 5.877025457266051, "grad_norm": 1.830942153930664, "learning_rate": 6.66715297581747e-07, "loss": 1.9594, "num_input_tokens_seen": 303935504, "step": 298500 }, { "epoch": 5.878994310015554, "grad_norm": 1.5987343788146973, "learning_rate": 6.665209016795698e-07, "loss": 1.9259, "num_input_tokens_seen": 304037304, "step": 298600 }, { "epoch": 5.880963162765057, "grad_norm": 2.0710251331329346, "learning_rate": 6.663264774619177e-07, "loss": 2.0013, "num_input_tokens_seen": 304138928, "step": 298700 }, { "epoch": 5.88293201551456, "grad_norm": 1.8655959367752075, "learning_rate": 6.66132024961851e-07, "loss": 1.9836, "num_input_tokens_seen": 304241328, "step": 298800 }, { "epoch": 5.8849008682640624, "grad_norm": 2.1698358058929443, "learning_rate": 6.659375442124346e-07, "loss": 1.9933, "num_input_tokens_seen": 304343728, "step": 298900 }, { "epoch": 5.886869721013565, "grad_norm": 1.8794077634811401, "learning_rate": 6.657430352467384e-07, "loss": 2.0005, "num_input_tokens_seen": 304446128, "step": 299000 }, { "epoch": 5.888838573763068, "grad_norm": 1.8287192583084106, "learning_rate": 6.655484980978369e-07, "loss": 2.0001, "num_input_tokens_seen": 304547992, "step": 299100 }, { "epoch": 5.890807426512572, "grad_norm": 2.011369228363037, "learning_rate": 6.653539327988099e-07, "loss": 2.0099, "num_input_tokens_seen": 304649976, "step": 299200 }, { "epoch": 5.892776279262074, "grad_norm": 2.1407384872436523, "learning_rate": 6.65159339382741e-07, "loss": 1.9703, "num_input_tokens_seen": 304751824, "step": 299300 }, { "epoch": 5.894745132011577, "grad_norm": 2.0718042850494385, "learning_rate": 6.649647178827196e-07, "loss": 2.0043, "num_input_tokens_seen": 304854224, "step": 299400 }, { "epoch": 5.89671398476108, "grad_norm": 2.4641711711883545, "learning_rate": 6.647700683318393e-07, "loss": 1.9998, "num_input_tokens_seen": 304956624, "step": 299500 }, { "epoch": 5.898682837510583, "grad_norm": 1.9418220520019531, "learning_rate": 6.645753907631985e-07, "loss": 1.9958, "num_input_tokens_seen": 305059024, "step": 299600 }, { "epoch": 5.900651690260085, "grad_norm": 2.018656015396118, "learning_rate": 6.643806852099008e-07, "loss": 1.9783, "num_input_tokens_seen": 305159496, "step": 299700 }, { "epoch": 5.902620543009588, "grad_norm": 1.9650826454162598, "learning_rate": 6.641859517050539e-07, "loss": 1.9963, "num_input_tokens_seen": 305261896, "step": 299800 }, { "epoch": 5.904589395759091, "grad_norm": 1.747464895248413, "learning_rate": 6.639911902817706e-07, "loss": 2.0045, "num_input_tokens_seen": 305364296, "step": 299900 }, { "epoch": 5.906558248508594, "grad_norm": 2.020000696182251, "learning_rate": 6.637964009731688e-07, "loss": 1.9278, "num_input_tokens_seen": 305466144, "step": 300000 }, { "epoch": 5.908527101258096, "grad_norm": 1.8583580255508423, "learning_rate": 6.636015838123705e-07, "loss": 1.9974, "num_input_tokens_seen": 305568544, "step": 300100 }, { "epoch": 5.9104959540076, "grad_norm": 1.9425022602081299, "learning_rate": 6.634067388325027e-07, "loss": 1.9641, "num_input_tokens_seen": 305670424, "step": 300200 }, { "epoch": 5.912464806757103, "grad_norm": 1.978251338005066, "learning_rate": 6.632118660666971e-07, "loss": 1.9954, "num_input_tokens_seen": 305772208, "step": 300300 }, { "epoch": 5.914433659506606, "grad_norm": 2.0952775478363037, "learning_rate": 6.630169655480903e-07, "loss": 2.0143, "num_input_tokens_seen": 305874256, "step": 300400 }, { "epoch": 5.916402512256108, "grad_norm": 2.00018572807312, "learning_rate": 6.628220373098234e-07, "loss": 1.9657, "num_input_tokens_seen": 305975320, "step": 300500 }, { "epoch": 5.918371365005611, "grad_norm": 2.504668951034546, "learning_rate": 6.626270813850425e-07, "loss": 1.9915, "num_input_tokens_seen": 306077192, "step": 300600 }, { "epoch": 5.920340217755114, "grad_norm": 1.8748530149459839, "learning_rate": 6.624320978068981e-07, "loss": 1.9738, "num_input_tokens_seen": 306178816, "step": 300700 }, { "epoch": 5.922309070504617, "grad_norm": 1.890470266342163, "learning_rate": 6.622370866085454e-07, "loss": 1.9791, "num_input_tokens_seen": 306281216, "step": 300800 }, { "epoch": 5.92427792325412, "grad_norm": 2.4180619716644287, "learning_rate": 6.620420478231445e-07, "loss": 1.9553, "num_input_tokens_seen": 306382984, "step": 300900 }, { "epoch": 5.926246776003623, "grad_norm": 2.6123709678649902, "learning_rate": 6.618469814838601e-07, "loss": 2.0037, "num_input_tokens_seen": 306483880, "step": 301000 }, { "epoch": 5.928215628753126, "grad_norm": 2.3167307376861572, "learning_rate": 6.616518876238616e-07, "loss": 1.9711, "num_input_tokens_seen": 306585752, "step": 301100 }, { "epoch": 5.930184481502629, "grad_norm": 1.7533416748046875, "learning_rate": 6.614567662763231e-07, "loss": 1.9962, "num_input_tokens_seen": 306688152, "step": 301200 }, { "epoch": 5.932153334252131, "grad_norm": 1.8543781042099, "learning_rate": 6.612616174744231e-07, "loss": 1.9595, "num_input_tokens_seen": 306790552, "step": 301300 }, { "epoch": 5.934122187001634, "grad_norm": 2.121992826461792, "learning_rate": 6.610664412513452e-07, "loss": 1.9946, "num_input_tokens_seen": 306891424, "step": 301400 }, { "epoch": 5.936091039751137, "grad_norm": 1.914199709892273, "learning_rate": 6.608712376402777e-07, "loss": 1.9761, "num_input_tokens_seen": 306993384, "step": 301500 }, { "epoch": 5.93805989250064, "grad_norm": 1.9082857370376587, "learning_rate": 6.606760066744129e-07, "loss": 2.0071, "num_input_tokens_seen": 307094568, "step": 301600 }, { "epoch": 5.940028745250142, "grad_norm": 2.240058660507202, "learning_rate": 6.604807483869485e-07, "loss": 2.0056, "num_input_tokens_seen": 307196744, "step": 301700 }, { "epoch": 5.941997597999645, "grad_norm": 1.9677985906600952, "learning_rate": 6.602854628110863e-07, "loss": 2.002, "num_input_tokens_seen": 307298552, "step": 301800 }, { "epoch": 5.943966450749149, "grad_norm": 2.0020642280578613, "learning_rate": 6.600901499800331e-07, "loss": 2.0117, "num_input_tokens_seen": 307400952, "step": 301900 }, { "epoch": 5.9459353034986515, "grad_norm": 1.9213149547576904, "learning_rate": 6.598948099270001e-07, "loss": 1.9945, "num_input_tokens_seen": 307501816, "step": 302000 }, { "epoch": 5.947904156248154, "grad_norm": 1.8820880651474, "learning_rate": 6.596994426852034e-07, "loss": 2.0174, "num_input_tokens_seen": 307604216, "step": 302100 }, { "epoch": 5.949873008997657, "grad_norm": 1.952135682106018, "learning_rate": 6.595040482878634e-07, "loss": 1.9804, "num_input_tokens_seen": 307706408, "step": 302200 }, { "epoch": 5.95184186174716, "grad_norm": 1.8562573194503784, "learning_rate": 6.593086267682052e-07, "loss": 1.9871, "num_input_tokens_seen": 307808168, "step": 302300 }, { "epoch": 5.9538107144966625, "grad_norm": 2.650224208831787, "learning_rate": 6.591131781594587e-07, "loss": 2.0571, "num_input_tokens_seen": 307907840, "step": 302400 }, { "epoch": 5.955779567246165, "grad_norm": 1.6823968887329102, "learning_rate": 6.589177024948584e-07, "loss": 1.982, "num_input_tokens_seen": 308010240, "step": 302500 }, { "epoch": 5.957748419995669, "grad_norm": 2.0597550868988037, "learning_rate": 6.587221998076429e-07, "loss": 1.97, "num_input_tokens_seen": 308112640, "step": 302600 }, { "epoch": 5.959717272745172, "grad_norm": 7.394330978393555, "learning_rate": 6.585266701310563e-07, "loss": 1.9622, "num_input_tokens_seen": 308213864, "step": 302700 }, { "epoch": 5.9616861254946745, "grad_norm": 2.0766587257385254, "learning_rate": 6.583311134983464e-07, "loss": 1.9822, "num_input_tokens_seen": 308315672, "step": 302800 }, { "epoch": 5.963654978244177, "grad_norm": 2.0015435218811035, "learning_rate": 6.581355299427662e-07, "loss": 1.9894, "num_input_tokens_seen": 308418072, "step": 302900 }, { "epoch": 5.96562383099368, "grad_norm": 1.8748551607131958, "learning_rate": 6.579399194975729e-07, "loss": 1.9726, "num_input_tokens_seen": 308520472, "step": 303000 }, { "epoch": 5.967592683743183, "grad_norm": 1.9935256242752075, "learning_rate": 6.577442821960284e-07, "loss": 2.0136, "num_input_tokens_seen": 308622208, "step": 303100 }, { "epoch": 5.9695615364926855, "grad_norm": 1.722451090812683, "learning_rate": 6.575486180713992e-07, "loss": 2.0166, "num_input_tokens_seen": 308723224, "step": 303200 }, { "epoch": 5.971530389242188, "grad_norm": 1.9466898441314697, "learning_rate": 6.573529271569562e-07, "loss": 2.0004, "num_input_tokens_seen": 308824872, "step": 303300 }, { "epoch": 5.973499241991691, "grad_norm": 1.8579930067062378, "learning_rate": 6.571572094859752e-07, "loss": 1.9879, "num_input_tokens_seen": 308926504, "step": 303400 }, { "epoch": 5.975468094741195, "grad_norm": 1.9859167337417603, "learning_rate": 6.569614650917362e-07, "loss": 1.9813, "num_input_tokens_seen": 309027640, "step": 303500 }, { "epoch": 5.977436947490697, "grad_norm": 1.9646979570388794, "learning_rate": 6.56765694007524e-07, "loss": 2.0483, "num_input_tokens_seen": 309129488, "step": 303600 }, { "epoch": 5.9794058002402, "grad_norm": 2.0104897022247314, "learning_rate": 6.565698962666277e-07, "loss": 1.9652, "num_input_tokens_seen": 309231336, "step": 303700 }, { "epoch": 5.981374652989703, "grad_norm": 1.8535076379776, "learning_rate": 6.563740719023412e-07, "loss": 2.0091, "num_input_tokens_seen": 309333736, "step": 303800 }, { "epoch": 5.983343505739206, "grad_norm": 2.1762800216674805, "learning_rate": 6.561782209479626e-07, "loss": 1.9728, "num_input_tokens_seen": 309436136, "step": 303900 }, { "epoch": 5.9853123584887085, "grad_norm": 1.9892023801803589, "learning_rate": 6.559823434367948e-07, "loss": 2.0493, "num_input_tokens_seen": 309538312, "step": 304000 }, { "epoch": 5.987281211238211, "grad_norm": 2.135478973388672, "learning_rate": 6.557864394021453e-07, "loss": 1.9785, "num_input_tokens_seen": 309640232, "step": 304100 }, { "epoch": 5.989250063987714, "grad_norm": 1.9356056451797485, "learning_rate": 6.555905088773255e-07, "loss": 1.9847, "num_input_tokens_seen": 309741984, "step": 304200 }, { "epoch": 5.991218916737218, "grad_norm": 1.9058266878128052, "learning_rate": 6.55394551895652e-07, "loss": 1.9464, "num_input_tokens_seen": 309843072, "step": 304300 }, { "epoch": 5.99318776948672, "grad_norm": 1.9438482522964478, "learning_rate": 6.551985684904457e-07, "loss": 1.9916, "num_input_tokens_seen": 309943736, "step": 304400 }, { "epoch": 5.995156622236223, "grad_norm": 2.3376619815826416, "learning_rate": 6.550025586950319e-07, "loss": 2.0215, "num_input_tokens_seen": 310046136, "step": 304500 }, { "epoch": 5.997125474985726, "grad_norm": 1.8232134580612183, "learning_rate": 6.548065225427402e-07, "loss": 1.9955, "num_input_tokens_seen": 310148536, "step": 304600 }, { "epoch": 5.999094327735229, "grad_norm": 2.128138303756714, "learning_rate": 6.546104600669051e-07, "loss": 2.0323, "num_input_tokens_seen": 310249568, "step": 304700 }, { "epoch": 6.001063180484731, "grad_norm": 1.7412737607955933, "learning_rate": 6.544143713008652e-07, "loss": 1.9671, "num_input_tokens_seen": 310351192, "step": 304800 }, { "epoch": 6.003032033234234, "grad_norm": 1.8386776447296143, "learning_rate": 6.542182562779641e-07, "loss": 1.9604, "num_input_tokens_seen": 310452128, "step": 304900 }, { "epoch": 6.005000885983737, "grad_norm": 2.1488161087036133, "learning_rate": 6.54022115031549e-07, "loss": 2.0128, "num_input_tokens_seen": 310552416, "step": 305000 }, { "epoch": 6.006969738733241, "grad_norm": 1.9873698949813843, "learning_rate": 6.538259475949726e-07, "loss": 1.9888, "num_input_tokens_seen": 310654816, "step": 305100 }, { "epoch": 6.008938591482743, "grad_norm": 2.0063347816467285, "learning_rate": 6.53629754001591e-07, "loss": 1.9771, "num_input_tokens_seen": 310757216, "step": 305200 }, { "epoch": 6.010907444232246, "grad_norm": 1.8294849395751953, "learning_rate": 6.534335342847657e-07, "loss": 2.0169, "num_input_tokens_seen": 310856768, "step": 305300 }, { "epoch": 6.012876296981749, "grad_norm": 1.804033875465393, "learning_rate": 6.532372884778618e-07, "loss": 1.9687, "num_input_tokens_seen": 310959168, "step": 305400 }, { "epoch": 6.014845149731252, "grad_norm": 1.8964186906814575, "learning_rate": 6.530410166142495e-07, "loss": 1.9883, "num_input_tokens_seen": 311060792, "step": 305500 }, { "epoch": 6.016814002480754, "grad_norm": 1.8022997379302979, "learning_rate": 6.528447187273032e-07, "loss": 1.9433, "num_input_tokens_seen": 311163192, "step": 305600 }, { "epoch": 6.018782855230257, "grad_norm": 2.142498254776001, "learning_rate": 6.526483948504016e-07, "loss": 1.964, "num_input_tokens_seen": 311263936, "step": 305700 }, { "epoch": 6.02075170797976, "grad_norm": 1.9424124956130981, "learning_rate": 6.52452045016928e-07, "loss": 2.0609, "num_input_tokens_seen": 311364776, "step": 305800 }, { "epoch": 6.022720560729263, "grad_norm": 1.9188220500946045, "learning_rate": 6.522556692602698e-07, "loss": 2.0215, "num_input_tokens_seen": 311465584, "step": 305900 }, { "epoch": 6.024689413478766, "grad_norm": 1.7720577716827393, "learning_rate": 6.520592676138196e-07, "loss": 1.9621, "num_input_tokens_seen": 311567984, "step": 306000 }, { "epoch": 6.026658266228269, "grad_norm": 1.8154102563858032, "learning_rate": 6.518628401109731e-07, "loss": 1.998, "num_input_tokens_seen": 311669728, "step": 306100 }, { "epoch": 6.028627118977772, "grad_norm": 2.2377007007598877, "learning_rate": 6.516663867851318e-07, "loss": 1.9819, "num_input_tokens_seen": 311770080, "step": 306200 }, { "epoch": 6.030595971727275, "grad_norm": 1.8667947053909302, "learning_rate": 6.514699076697005e-07, "loss": 1.9614, "num_input_tokens_seen": 311872480, "step": 306300 }, { "epoch": 6.032564824476777, "grad_norm": 1.9634737968444824, "learning_rate": 6.512734027980889e-07, "loss": 2.0079, "num_input_tokens_seen": 311974880, "step": 306400 }, { "epoch": 6.03453367722628, "grad_norm": 2.067988157272339, "learning_rate": 6.51076872203711e-07, "loss": 2.0224, "num_input_tokens_seen": 312076840, "step": 306500 }, { "epoch": 6.036502529975783, "grad_norm": 1.9597209692001343, "learning_rate": 6.508803159199854e-07, "loss": 1.962, "num_input_tokens_seen": 312179240, "step": 306600 }, { "epoch": 6.038471382725286, "grad_norm": 1.8099509477615356, "learning_rate": 6.506837339803346e-07, "loss": 1.9511, "num_input_tokens_seen": 312281640, "step": 306700 }, { "epoch": 6.040440235474789, "grad_norm": 2.1230101585388184, "learning_rate": 6.504871264181857e-07, "loss": 2.016, "num_input_tokens_seen": 312384040, "step": 306800 }, { "epoch": 6.042409088224292, "grad_norm": 1.8817248344421387, "learning_rate": 6.502904932669702e-07, "loss": 2.0034, "num_input_tokens_seen": 312484688, "step": 306900 }, { "epoch": 6.044377940973795, "grad_norm": 1.9372673034667969, "learning_rate": 6.500938345601242e-07, "loss": 2.0577, "num_input_tokens_seen": 312586792, "step": 307000 }, { "epoch": 6.0463467937232975, "grad_norm": 1.7838410139083862, "learning_rate": 6.498971503310872e-07, "loss": 1.9655, "num_input_tokens_seen": 312688880, "step": 307100 }, { "epoch": 6.0483156464728, "grad_norm": 2.113298177719116, "learning_rate": 6.497004406133044e-07, "loss": 2.0385, "num_input_tokens_seen": 312790280, "step": 307200 }, { "epoch": 6.050284499222303, "grad_norm": 1.816819190979004, "learning_rate": 6.495037054402242e-07, "loss": 1.9808, "num_input_tokens_seen": 312892680, "step": 307300 }, { "epoch": 6.052253351971806, "grad_norm": 1.963083028793335, "learning_rate": 6.493069448452998e-07, "loss": 1.9673, "num_input_tokens_seen": 312994440, "step": 307400 }, { "epoch": 6.054222204721309, "grad_norm": 1.9467344284057617, "learning_rate": 6.491101588619888e-07, "loss": 1.9874, "num_input_tokens_seen": 313096840, "step": 307500 }, { "epoch": 6.056191057470811, "grad_norm": 1.747800588607788, "learning_rate": 6.489133475237528e-07, "loss": 1.9606, "num_input_tokens_seen": 313198624, "step": 307600 }, { "epoch": 6.058159910220315, "grad_norm": 2.129979133605957, "learning_rate": 6.487165108640581e-07, "loss": 2.0203, "num_input_tokens_seen": 313300320, "step": 307700 }, { "epoch": 6.060128762969818, "grad_norm": 1.9538705348968506, "learning_rate": 6.485196489163752e-07, "loss": 2.0202, "num_input_tokens_seen": 313401664, "step": 307800 }, { "epoch": 6.0620976157193205, "grad_norm": 2.107147216796875, "learning_rate": 6.483227617141786e-07, "loss": 1.9897, "num_input_tokens_seen": 313504064, "step": 307900 }, { "epoch": 6.064066468468823, "grad_norm": 2.035672903060913, "learning_rate": 6.481258492909474e-07, "loss": 2.024, "num_input_tokens_seen": 313604280, "step": 308000 }, { "epoch": 6.066035321218326, "grad_norm": 1.7072129249572754, "learning_rate": 6.479289116801648e-07, "loss": 2.0033, "num_input_tokens_seen": 313705928, "step": 308100 }, { "epoch": 6.068004173967829, "grad_norm": 1.9450758695602417, "learning_rate": 6.477319489153185e-07, "loss": 1.9879, "num_input_tokens_seen": 313808328, "step": 308200 }, { "epoch": 6.0699730267173315, "grad_norm": 2.076735019683838, "learning_rate": 6.475349610299004e-07, "loss": 1.9798, "num_input_tokens_seen": 313909912, "step": 308300 }, { "epoch": 6.071941879466834, "grad_norm": 1.7819191217422485, "learning_rate": 6.473379480574065e-07, "loss": 2.0272, "num_input_tokens_seen": 314011856, "step": 308400 }, { "epoch": 6.073910732216338, "grad_norm": 2.2660250663757324, "learning_rate": 6.471409100313372e-07, "loss": 2.0186, "num_input_tokens_seen": 314114256, "step": 308500 }, { "epoch": 6.075879584965841, "grad_norm": 1.8634023666381836, "learning_rate": 6.469438469851971e-07, "loss": 2.01, "num_input_tokens_seen": 314215208, "step": 308600 }, { "epoch": 6.077848437715343, "grad_norm": 1.7328294515609741, "learning_rate": 6.467467589524954e-07, "loss": 1.9789, "num_input_tokens_seen": 314317608, "step": 308700 }, { "epoch": 6.079817290464846, "grad_norm": 2.0241761207580566, "learning_rate": 6.465496459667449e-07, "loss": 2.0178, "num_input_tokens_seen": 314418184, "step": 308800 }, { "epoch": 6.081786143214349, "grad_norm": 1.7773127555847168, "learning_rate": 6.463525080614631e-07, "loss": 2.0023, "num_input_tokens_seen": 314520584, "step": 308900 }, { "epoch": 6.083754995963852, "grad_norm": 2.274423122406006, "learning_rate": 6.46155345270172e-07, "loss": 1.9614, "num_input_tokens_seen": 314622984, "step": 309000 }, { "epoch": 6.0857238487133545, "grad_norm": 2.167804479598999, "learning_rate": 6.459581576263969e-07, "loss": 2.0152, "num_input_tokens_seen": 314724968, "step": 309100 }, { "epoch": 6.087692701462857, "grad_norm": 1.6423709392547607, "learning_rate": 6.457609451636683e-07, "loss": 2.0155, "num_input_tokens_seen": 314827368, "step": 309200 }, { "epoch": 6.089661554212361, "grad_norm": 1.932984709739685, "learning_rate": 6.455637079155203e-07, "loss": 1.9914, "num_input_tokens_seen": 314929080, "step": 309300 }, { "epoch": 6.091630406961864, "grad_norm": 2.155951976776123, "learning_rate": 6.453664459154916e-07, "loss": 1.9567, "num_input_tokens_seen": 315031480, "step": 309400 }, { "epoch": 6.093599259711366, "grad_norm": 1.9752286672592163, "learning_rate": 6.451691591971248e-07, "loss": 1.9582, "num_input_tokens_seen": 315133880, "step": 309500 }, { "epoch": 6.095568112460869, "grad_norm": 1.9401935338974, "learning_rate": 6.449718477939669e-07, "loss": 1.9556, "num_input_tokens_seen": 315236280, "step": 309600 }, { "epoch": 6.097536965210372, "grad_norm": 1.7289832830429077, "learning_rate": 6.44774511739569e-07, "loss": 1.9876, "num_input_tokens_seen": 315336760, "step": 309700 }, { "epoch": 6.099505817959875, "grad_norm": 1.9301186800003052, "learning_rate": 6.445771510674865e-07, "loss": 1.988, "num_input_tokens_seen": 315439160, "step": 309800 }, { "epoch": 6.101474670709377, "grad_norm": 1.912955641746521, "learning_rate": 6.443797658112788e-07, "loss": 1.9711, "num_input_tokens_seen": 315541112, "step": 309900 }, { "epoch": 6.10344352345888, "grad_norm": 1.7604267597198486, "learning_rate": 6.441823560045098e-07, "loss": 1.9465, "num_input_tokens_seen": 315642496, "step": 310000 }, { "epoch": 6.105412376208383, "grad_norm": 1.8021234273910522, "learning_rate": 6.439849216807472e-07, "loss": 1.9585, "num_input_tokens_seen": 315744896, "step": 310100 }, { "epoch": 6.107381228957887, "grad_norm": 2.0396640300750732, "learning_rate": 6.437874628735631e-07, "loss": 2.0367, "num_input_tokens_seen": 315846560, "step": 310200 }, { "epoch": 6.109350081707389, "grad_norm": 1.698716163635254, "learning_rate": 6.435899796165337e-07, "loss": 1.9712, "num_input_tokens_seen": 315948960, "step": 310300 }, { "epoch": 6.111318934456892, "grad_norm": 1.9908227920532227, "learning_rate": 6.433924719432393e-07, "loss": 1.9535, "num_input_tokens_seen": 316050872, "step": 310400 }, { "epoch": 6.113287787206395, "grad_norm": 1.7665883302688599, "learning_rate": 6.431949398872646e-07, "loss": 1.9809, "num_input_tokens_seen": 316151576, "step": 310500 }, { "epoch": 6.115256639955898, "grad_norm": 1.7452491521835327, "learning_rate": 6.429973834821979e-07, "loss": 2.0128, "num_input_tokens_seen": 316253384, "step": 310600 }, { "epoch": 6.1172254927054, "grad_norm": 2.101358652114868, "learning_rate": 6.427998027616324e-07, "loss": 1.9806, "num_input_tokens_seen": 316355784, "step": 310700 }, { "epoch": 6.119194345454903, "grad_norm": 1.9029123783111572, "learning_rate": 6.426021977591648e-07, "loss": 1.9777, "num_input_tokens_seen": 316456696, "step": 310800 }, { "epoch": 6.121163198204406, "grad_norm": 1.7739269733428955, "learning_rate": 6.424045685083963e-07, "loss": 1.9684, "num_input_tokens_seen": 316558008, "step": 310900 }, { "epoch": 6.1231320509539096, "grad_norm": 2.0812501907348633, "learning_rate": 6.42206915042932e-07, "loss": 2.0026, "num_input_tokens_seen": 316659848, "step": 311000 }, { "epoch": 6.125100903703412, "grad_norm": 1.91690194606781, "learning_rate": 6.420092373963814e-07, "loss": 1.9969, "num_input_tokens_seen": 316761712, "step": 311100 }, { "epoch": 6.127069756452915, "grad_norm": 1.6721659898757935, "learning_rate": 6.418115356023575e-07, "loss": 1.9864, "num_input_tokens_seen": 316864112, "step": 311200 }, { "epoch": 6.129038609202418, "grad_norm": 4.432291507720947, "learning_rate": 6.416138096944782e-07, "loss": 2.0291, "num_input_tokens_seen": 316965184, "step": 311300 }, { "epoch": 6.131007461951921, "grad_norm": 1.8571569919586182, "learning_rate": 6.414160597063648e-07, "loss": 2.0362, "num_input_tokens_seen": 317067584, "step": 311400 }, { "epoch": 6.132976314701423, "grad_norm": 2.183241128921509, "learning_rate": 6.412182856716435e-07, "loss": 2.0357, "num_input_tokens_seen": 317169984, "step": 311500 }, { "epoch": 6.134945167450926, "grad_norm": 1.953399658203125, "learning_rate": 6.410204876239435e-07, "loss": 1.9955, "num_input_tokens_seen": 317272384, "step": 311600 }, { "epoch": 6.136914020200429, "grad_norm": 1.638275384902954, "learning_rate": 6.408226655968992e-07, "loss": 1.9822, "num_input_tokens_seen": 317373952, "step": 311700 }, { "epoch": 6.138882872949932, "grad_norm": 1.8970870971679688, "learning_rate": 6.406248196241482e-07, "loss": 1.9544, "num_input_tokens_seen": 317476352, "step": 311800 }, { "epoch": 6.140851725699435, "grad_norm": 2.006563186645508, "learning_rate": 6.404269497393329e-07, "loss": 1.9694, "num_input_tokens_seen": 317578752, "step": 311900 }, { "epoch": 6.142820578448938, "grad_norm": 2.508770704269409, "learning_rate": 6.402290559760991e-07, "loss": 1.9487, "num_input_tokens_seen": 317681152, "step": 312000 }, { "epoch": 6.144789431198441, "grad_norm": 1.7859289646148682, "learning_rate": 6.400311383680972e-07, "loss": 2.0625, "num_input_tokens_seen": 317782736, "step": 312100 }, { "epoch": 6.1467582839479435, "grad_norm": 1.9423662424087524, "learning_rate": 6.398331969489812e-07, "loss": 2.0034, "num_input_tokens_seen": 317882104, "step": 312200 }, { "epoch": 6.148727136697446, "grad_norm": 1.9616044759750366, "learning_rate": 6.396352317524094e-07, "loss": 2.0182, "num_input_tokens_seen": 317983328, "step": 312300 }, { "epoch": 6.150695989446949, "grad_norm": 1.9331483840942383, "learning_rate": 6.394372428120442e-07, "loss": 2.0063, "num_input_tokens_seen": 318085728, "step": 312400 }, { "epoch": 6.152664842196452, "grad_norm": 2.319206953048706, "learning_rate": 6.392392301615521e-07, "loss": 1.9624, "num_input_tokens_seen": 318187600, "step": 312500 }, { "epoch": 6.154633694945955, "grad_norm": 2.0272469520568848, "learning_rate": 6.390411938346032e-07, "loss": 1.9519, "num_input_tokens_seen": 318289240, "step": 312600 }, { "epoch": 6.156602547695458, "grad_norm": 5.838209629058838, "learning_rate": 6.388431338648719e-07, "loss": 1.9764, "num_input_tokens_seen": 318390368, "step": 312700 }, { "epoch": 6.158571400444961, "grad_norm": 1.9024221897125244, "learning_rate": 6.386450502860369e-07, "loss": 2.0118, "num_input_tokens_seen": 318491344, "step": 312800 }, { "epoch": 6.160540253194464, "grad_norm": 2.111111879348755, "learning_rate": 6.384469431317805e-07, "loss": 2.008, "num_input_tokens_seen": 318593744, "step": 312900 }, { "epoch": 6.1625091059439665, "grad_norm": 1.8804786205291748, "learning_rate": 6.382488124357889e-07, "loss": 1.9992, "num_input_tokens_seen": 318696144, "step": 313000 }, { "epoch": 6.164477958693469, "grad_norm": 1.8802192211151123, "learning_rate": 6.380506582317531e-07, "loss": 1.9387, "num_input_tokens_seen": 318798544, "step": 313100 }, { "epoch": 6.166446811442972, "grad_norm": 2.6956980228424072, "learning_rate": 6.37852480553367e-07, "loss": 1.9605, "num_input_tokens_seen": 318900944, "step": 313200 }, { "epoch": 6.168415664192475, "grad_norm": 2.159541606903076, "learning_rate": 6.376542794343295e-07, "loss": 1.9499, "num_input_tokens_seen": 319001936, "step": 313300 }, { "epoch": 6.1703845169419775, "grad_norm": 1.9137659072875977, "learning_rate": 6.374560549083426e-07, "loss": 1.9951, "num_input_tokens_seen": 319103568, "step": 313400 }, { "epoch": 6.172353369691481, "grad_norm": 1.8772330284118652, "learning_rate": 6.372578070091131e-07, "loss": 1.9354, "num_input_tokens_seen": 319205968, "step": 313500 }, { "epoch": 6.174322222440984, "grad_norm": 2.034302234649658, "learning_rate": 6.370595357703511e-07, "loss": 2.0053, "num_input_tokens_seen": 319308368, "step": 313600 }, { "epoch": 6.176291075190487, "grad_norm": 2.2465462684631348, "learning_rate": 6.36861241225771e-07, "loss": 1.9863, "num_input_tokens_seen": 319410128, "step": 313700 }, { "epoch": 6.1782599279399895, "grad_norm": 1.771889090538025, "learning_rate": 6.366629234090913e-07, "loss": 1.9867, "num_input_tokens_seen": 319512528, "step": 313800 }, { "epoch": 6.180228780689492, "grad_norm": 1.7891899347305298, "learning_rate": 6.364645823540341e-07, "loss": 1.9278, "num_input_tokens_seen": 319614928, "step": 313900 }, { "epoch": 6.182197633438995, "grad_norm": 1.8706449270248413, "learning_rate": 6.362662180943257e-07, "loss": 2.0377, "num_input_tokens_seen": 319716376, "step": 314000 }, { "epoch": 6.184166486188498, "grad_norm": 1.7015857696533203, "learning_rate": 6.360678306636962e-07, "loss": 1.9887, "num_input_tokens_seen": 319817640, "step": 314100 }, { "epoch": 6.1861353389380005, "grad_norm": 1.8962303400039673, "learning_rate": 6.358694200958797e-07, "loss": 2.0208, "num_input_tokens_seen": 319919264, "step": 314200 }, { "epoch": 6.188104191687504, "grad_norm": 2.0260705947875977, "learning_rate": 6.356709864246143e-07, "loss": 1.9916, "num_input_tokens_seen": 320021568, "step": 314300 }, { "epoch": 6.190073044437007, "grad_norm": 2.3741517066955566, "learning_rate": 6.354725296836421e-07, "loss": 2.0141, "num_input_tokens_seen": 320123920, "step": 314400 }, { "epoch": 6.19204189718651, "grad_norm": 2.0920419692993164, "learning_rate": 6.352740499067086e-07, "loss": 2.0139, "num_input_tokens_seen": 320226024, "step": 314500 }, { "epoch": 6.194010749936012, "grad_norm": 1.7310185432434082, "learning_rate": 6.350755471275642e-07, "loss": 1.9685, "num_input_tokens_seen": 320328424, "step": 314600 }, { "epoch": 6.195979602685515, "grad_norm": 1.8541840314865112, "learning_rate": 6.34877021379962e-07, "loss": 1.966, "num_input_tokens_seen": 320430824, "step": 314700 }, { "epoch": 6.197948455435018, "grad_norm": 1.7966785430908203, "learning_rate": 6.346784726976599e-07, "loss": 1.9815, "num_input_tokens_seen": 320533224, "step": 314800 }, { "epoch": 6.199917308184521, "grad_norm": 1.8406530618667603, "learning_rate": 6.344799011144196e-07, "loss": 2.0185, "num_input_tokens_seen": 320634864, "step": 314900 }, { "epoch": 6.2018861609340235, "grad_norm": 1.9197044372558594, "learning_rate": 6.342813066640063e-07, "loss": 2.0046, "num_input_tokens_seen": 320736832, "step": 315000 }, { "epoch": 6.203855013683526, "grad_norm": 1.9414684772491455, "learning_rate": 6.340826893801895e-07, "loss": 1.9641, "num_input_tokens_seen": 320839232, "step": 315100 }, { "epoch": 6.20582386643303, "grad_norm": 1.6983762979507446, "learning_rate": 6.338840492967422e-07, "loss": 1.9799, "num_input_tokens_seen": 320940808, "step": 315200 }, { "epoch": 6.207792719182533, "grad_norm": 2.312974214553833, "learning_rate": 6.336853864474416e-07, "loss": 1.9989, "num_input_tokens_seen": 321043208, "step": 315300 }, { "epoch": 6.209761571932035, "grad_norm": 1.7660846710205078, "learning_rate": 6.334867008660685e-07, "loss": 1.9933, "num_input_tokens_seen": 321145608, "step": 315400 }, { "epoch": 6.211730424681538, "grad_norm": 2.285482406616211, "learning_rate": 6.332879925864079e-07, "loss": 1.9407, "num_input_tokens_seen": 321248008, "step": 315500 }, { "epoch": 6.213699277431041, "grad_norm": 2.0247650146484375, "learning_rate": 6.330892616422483e-07, "loss": 2.0361, "num_input_tokens_seen": 321350408, "step": 315600 }, { "epoch": 6.215668130180544, "grad_norm": 1.7272166013717651, "learning_rate": 6.328905080673821e-07, "loss": 2.0177, "num_input_tokens_seen": 321451936, "step": 315700 }, { "epoch": 6.217636982930046, "grad_norm": 1.9985069036483765, "learning_rate": 6.326917318956061e-07, "loss": 2.0181, "num_input_tokens_seen": 321554336, "step": 315800 }, { "epoch": 6.219605835679549, "grad_norm": 1.9732002019882202, "learning_rate": 6.324929331607201e-07, "loss": 1.9909, "num_input_tokens_seen": 321655032, "step": 315900 }, { "epoch": 6.221574688429053, "grad_norm": 2.13789439201355, "learning_rate": 6.322941118965285e-07, "loss": 1.9734, "num_input_tokens_seen": 321757432, "step": 316000 }, { "epoch": 6.223543541178556, "grad_norm": 1.8527015447616577, "learning_rate": 6.320952681368389e-07, "loss": 1.9818, "num_input_tokens_seen": 321859072, "step": 316100 }, { "epoch": 6.225512393928058, "grad_norm": 1.8170995712280273, "learning_rate": 6.318964019154632e-07, "loss": 1.9601, "num_input_tokens_seen": 321960168, "step": 316200 }, { "epoch": 6.227481246677561, "grad_norm": 2.16269850730896, "learning_rate": 6.316975132662165e-07, "loss": 2.0435, "num_input_tokens_seen": 322060616, "step": 316300 }, { "epoch": 6.229450099427064, "grad_norm": 1.6585129499435425, "learning_rate": 6.314986022229189e-07, "loss": 1.9044, "num_input_tokens_seen": 322162184, "step": 316400 }, { "epoch": 6.231418952176567, "grad_norm": 1.984542965888977, "learning_rate": 6.312996688193927e-07, "loss": 1.993, "num_input_tokens_seen": 322263808, "step": 316500 }, { "epoch": 6.233387804926069, "grad_norm": 1.7414846420288086, "learning_rate": 6.311007130894656e-07, "loss": 1.9723, "num_input_tokens_seen": 322366208, "step": 316600 }, { "epoch": 6.235356657675572, "grad_norm": 1.8732280731201172, "learning_rate": 6.309017350669677e-07, "loss": 1.9883, "num_input_tokens_seen": 322467584, "step": 316700 }, { "epoch": 6.237325510425075, "grad_norm": 1.8613649606704712, "learning_rate": 6.307027347857338e-07, "loss": 1.9783, "num_input_tokens_seen": 322569432, "step": 316800 }, { "epoch": 6.2392943631745785, "grad_norm": 1.6686112880706787, "learning_rate": 6.305037122796027e-07, "loss": 1.9398, "num_input_tokens_seen": 322671832, "step": 316900 }, { "epoch": 6.241263215924081, "grad_norm": 2.0981709957122803, "learning_rate": 6.303046675824156e-07, "loss": 2.0154, "num_input_tokens_seen": 322774232, "step": 317000 }, { "epoch": 6.243232068673584, "grad_norm": 1.8230525255203247, "learning_rate": 6.301056007280189e-07, "loss": 1.956, "num_input_tokens_seen": 322876632, "step": 317100 }, { "epoch": 6.245200921423087, "grad_norm": 2.0213985443115234, "learning_rate": 6.299065117502623e-07, "loss": 1.9618, "num_input_tokens_seen": 322979032, "step": 317200 }, { "epoch": 6.24716977417259, "grad_norm": 1.8146755695343018, "learning_rate": 6.29707400682999e-07, "loss": 2.0166, "num_input_tokens_seen": 323079712, "step": 317300 }, { "epoch": 6.249138626922092, "grad_norm": 1.784530520439148, "learning_rate": 6.295082675600862e-07, "loss": 1.9586, "num_input_tokens_seen": 323182112, "step": 317400 }, { "epoch": 6.251107479671595, "grad_norm": 1.963919997215271, "learning_rate": 6.293091124153847e-07, "loss": 1.9939, "num_input_tokens_seen": 323283144, "step": 317500 }, { "epoch": 6.253076332421098, "grad_norm": 1.6982088088989258, "learning_rate": 6.291099352827596e-07, "loss": 2.0027, "num_input_tokens_seen": 323385544, "step": 317600 }, { "epoch": 6.2550451851706015, "grad_norm": 1.648306965827942, "learning_rate": 6.289107361960786e-07, "loss": 1.9662, "num_input_tokens_seen": 323487352, "step": 317700 }, { "epoch": 6.257014037920104, "grad_norm": 1.9238020181655884, "learning_rate": 6.287115151892144e-07, "loss": 2.022, "num_input_tokens_seen": 323588072, "step": 317800 }, { "epoch": 6.258982890669607, "grad_norm": 2.021799325942993, "learning_rate": 6.285122722960426e-07, "loss": 1.9903, "num_input_tokens_seen": 323689392, "step": 317900 }, { "epoch": 6.26095174341911, "grad_norm": 1.8029745817184448, "learning_rate": 6.283130075504428e-07, "loss": 2.026, "num_input_tokens_seen": 323791560, "step": 318000 }, { "epoch": 6.2629205961686125, "grad_norm": 1.8561002016067505, "learning_rate": 6.281137209862984e-07, "loss": 1.984, "num_input_tokens_seen": 323893424, "step": 318100 }, { "epoch": 6.264889448918115, "grad_norm": 1.7570619583129883, "learning_rate": 6.279144126374962e-07, "loss": 1.9701, "num_input_tokens_seen": 323995824, "step": 318200 }, { "epoch": 6.266858301667618, "grad_norm": 2.216931104660034, "learning_rate": 6.27715082537927e-07, "loss": 1.9989, "num_input_tokens_seen": 324096224, "step": 318300 }, { "epoch": 6.268827154417121, "grad_norm": 1.9495714902877808, "learning_rate": 6.275157307214853e-07, "loss": 1.9546, "num_input_tokens_seen": 324196616, "step": 318400 }, { "epoch": 6.2707960071666236, "grad_norm": 1.943924069404602, "learning_rate": 6.273163572220691e-07, "loss": 1.9438, "num_input_tokens_seen": 324299016, "step": 318500 }, { "epoch": 6.272764859916127, "grad_norm": 2.0724143981933594, "learning_rate": 6.271169620735801e-07, "loss": 1.9761, "num_input_tokens_seen": 324401416, "step": 318600 }, { "epoch": 6.27473371266563, "grad_norm": 1.738439917564392, "learning_rate": 6.269175453099241e-07, "loss": 2.0024, "num_input_tokens_seen": 324503136, "step": 318700 }, { "epoch": 6.276702565415133, "grad_norm": 1.8139582872390747, "learning_rate": 6.267181069650096e-07, "loss": 1.9696, "num_input_tokens_seen": 324605536, "step": 318800 }, { "epoch": 6.2786714181646355, "grad_norm": 2.2022266387939453, "learning_rate": 6.2651864707275e-07, "loss": 1.9885, "num_input_tokens_seen": 324707936, "step": 318900 }, { "epoch": 6.280640270914138, "grad_norm": 1.9122960567474365, "learning_rate": 6.263191656670615e-07, "loss": 1.9958, "num_input_tokens_seen": 324810336, "step": 319000 }, { "epoch": 6.282609123663641, "grad_norm": 1.9304590225219727, "learning_rate": 6.261196627818646e-07, "loss": 1.9889, "num_input_tokens_seen": 324912024, "step": 319100 }, { "epoch": 6.284577976413144, "grad_norm": 1.790189266204834, "learning_rate": 6.259201384510824e-07, "loss": 1.9664, "num_input_tokens_seen": 325013592, "step": 319200 }, { "epoch": 6.2865468291626465, "grad_norm": 2.089468002319336, "learning_rate": 6.25720592708643e-07, "loss": 1.9934, "num_input_tokens_seen": 325115456, "step": 319300 }, { "epoch": 6.28851568191215, "grad_norm": 2.296259880065918, "learning_rate": 6.25521025588477e-07, "loss": 2.1073, "num_input_tokens_seen": 325216088, "step": 319400 }, { "epoch": 6.290484534661653, "grad_norm": 2.262141466140747, "learning_rate": 6.253214371245195e-07, "loss": 2.0009, "num_input_tokens_seen": 325318488, "step": 319500 }, { "epoch": 6.292453387411156, "grad_norm": 2.2707834243774414, "learning_rate": 6.251218273507084e-07, "loss": 2.0127, "num_input_tokens_seen": 325420888, "step": 319600 }, { "epoch": 6.294422240160658, "grad_norm": 1.8361297845840454, "learning_rate": 6.249221963009859e-07, "loss": 2.0115, "num_input_tokens_seen": 325522520, "step": 319700 }, { "epoch": 6.296391092910161, "grad_norm": 1.8942452669143677, "learning_rate": 6.247225440092976e-07, "loss": 1.9458, "num_input_tokens_seen": 325624920, "step": 319800 }, { "epoch": 6.298359945659664, "grad_norm": 1.986633539199829, "learning_rate": 6.245228705095928e-07, "loss": 1.9764, "num_input_tokens_seen": 325725856, "step": 319900 }, { "epoch": 6.300328798409167, "grad_norm": 2.036482810974121, "learning_rate": 6.243231758358239e-07, "loss": 1.9396, "num_input_tokens_seen": 325828256, "step": 320000 }, { "epoch": 6.3022976511586695, "grad_norm": 1.6657260656356812, "learning_rate": 6.241234600219477e-07, "loss": 1.983, "num_input_tokens_seen": 325930656, "step": 320100 }, { "epoch": 6.304266503908173, "grad_norm": 1.9341163635253906, "learning_rate": 6.23923723101924e-07, "loss": 1.9626, "num_input_tokens_seen": 326033056, "step": 320200 }, { "epoch": 6.306235356657676, "grad_norm": 1.9460424184799194, "learning_rate": 6.237239651097165e-07, "loss": 1.9853, "num_input_tokens_seen": 326135456, "step": 320300 }, { "epoch": 6.308204209407179, "grad_norm": 1.927511215209961, "learning_rate": 6.235241860792921e-07, "loss": 1.9929, "num_input_tokens_seen": 326237272, "step": 320400 }, { "epoch": 6.310173062156681, "grad_norm": 2.020237922668457, "learning_rate": 6.233243860446218e-07, "loss": 1.951, "num_input_tokens_seen": 326339672, "step": 320500 }, { "epoch": 6.312141914906184, "grad_norm": 1.974599838256836, "learning_rate": 6.231245650396798e-07, "loss": 2.0165, "num_input_tokens_seen": 326441152, "step": 320600 }, { "epoch": 6.314110767655687, "grad_norm": 2.6221978664398193, "learning_rate": 6.22924723098444e-07, "loss": 1.9924, "num_input_tokens_seen": 326543376, "step": 320700 }, { "epoch": 6.31607962040519, "grad_norm": 1.7145941257476807, "learning_rate": 6.227248602548958e-07, "loss": 2.0027, "num_input_tokens_seen": 326645776, "step": 320800 }, { "epoch": 6.318048473154692, "grad_norm": 1.776474118232727, "learning_rate": 6.225249765430203e-07, "loss": 1.981, "num_input_tokens_seen": 326748176, "step": 320900 }, { "epoch": 6.320017325904196, "grad_norm": 1.6626194715499878, "learning_rate": 6.22325071996806e-07, "loss": 1.9753, "num_input_tokens_seen": 326850576, "step": 321000 }, { "epoch": 6.321986178653699, "grad_norm": 2.00114107131958, "learning_rate": 6.221251466502447e-07, "loss": 1.9666, "num_input_tokens_seen": 326952464, "step": 321100 }, { "epoch": 6.323955031403202, "grad_norm": 1.800858974456787, "learning_rate": 6.219252005373326e-07, "loss": 1.9883, "num_input_tokens_seen": 327054088, "step": 321200 }, { "epoch": 6.325923884152704, "grad_norm": 1.948839783668518, "learning_rate": 6.217252336920682e-07, "loss": 1.9541, "num_input_tokens_seen": 327155672, "step": 321300 }, { "epoch": 6.327892736902207, "grad_norm": 1.7164121866226196, "learning_rate": 6.215252461484546e-07, "loss": 2.0029, "num_input_tokens_seen": 327255728, "step": 321400 }, { "epoch": 6.32986158965171, "grad_norm": 2.1410481929779053, "learning_rate": 6.21325237940498e-07, "loss": 2.0022, "num_input_tokens_seen": 327357848, "step": 321500 }, { "epoch": 6.331830442401213, "grad_norm": 2.3601491451263428, "learning_rate": 6.211252091022078e-07, "loss": 1.9813, "num_input_tokens_seen": 327460040, "step": 321600 }, { "epoch": 6.333799295150715, "grad_norm": 2.4539542198181152, "learning_rate": 6.209251596675975e-07, "loss": 1.9681, "num_input_tokens_seen": 327561032, "step": 321700 }, { "epoch": 6.335768147900218, "grad_norm": 2.0476949214935303, "learning_rate": 6.207250896706838e-07, "loss": 1.9965, "num_input_tokens_seen": 327662536, "step": 321800 }, { "epoch": 6.337737000649722, "grad_norm": 3.0949981212615967, "learning_rate": 6.205249991454868e-07, "loss": 2.0221, "num_input_tokens_seen": 327764936, "step": 321900 }, { "epoch": 6.3397058533992245, "grad_norm": 1.8338130712509155, "learning_rate": 6.203248881260301e-07, "loss": 1.9776, "num_input_tokens_seen": 327865688, "step": 322000 }, { "epoch": 6.341674706148727, "grad_norm": 1.751347303390503, "learning_rate": 6.201247566463412e-07, "loss": 1.9853, "num_input_tokens_seen": 327966528, "step": 322100 }, { "epoch": 6.34364355889823, "grad_norm": 2.0570356845855713, "learning_rate": 6.199246047404506e-07, "loss": 1.9795, "num_input_tokens_seen": 328068064, "step": 322200 }, { "epoch": 6.345612411647733, "grad_norm": 1.7843410968780518, "learning_rate": 6.197244324423925e-07, "loss": 1.9746, "num_input_tokens_seen": 328170464, "step": 322300 }, { "epoch": 6.347581264397236, "grad_norm": 1.8098764419555664, "learning_rate": 6.195242397862043e-07, "loss": 1.9834, "num_input_tokens_seen": 328272120, "step": 322400 }, { "epoch": 6.349550117146738, "grad_norm": 2.0858922004699707, "learning_rate": 6.193240268059275e-07, "loss": 1.9511, "num_input_tokens_seen": 328373880, "step": 322500 }, { "epoch": 6.351518969896241, "grad_norm": 1.7641922235488892, "learning_rate": 6.191237935356062e-07, "loss": 1.9747, "num_input_tokens_seen": 328475704, "step": 322600 }, { "epoch": 6.353487822645745, "grad_norm": 1.9336637258529663, "learning_rate": 6.189235400092885e-07, "loss": 1.9615, "num_input_tokens_seen": 328578104, "step": 322700 }, { "epoch": 6.3554566753952475, "grad_norm": 1.7192039489746094, "learning_rate": 6.187232662610259e-07, "loss": 1.9806, "num_input_tokens_seen": 328679928, "step": 322800 }, { "epoch": 6.35742552814475, "grad_norm": 1.8622206449508667, "learning_rate": 6.185229723248731e-07, "loss": 1.9712, "num_input_tokens_seen": 328781864, "step": 322900 }, { "epoch": 6.359394380894253, "grad_norm": 6.362214088439941, "learning_rate": 6.183226582348888e-07, "loss": 1.9722, "num_input_tokens_seen": 328883384, "step": 323000 }, { "epoch": 6.361363233643756, "grad_norm": 2.1649420261383057, "learning_rate": 6.181223240251343e-07, "loss": 1.9865, "num_input_tokens_seen": 328985784, "step": 323100 }, { "epoch": 6.3633320863932585, "grad_norm": 3.7690231800079346, "learning_rate": 6.17921969729675e-07, "loss": 1.9655, "num_input_tokens_seen": 329087640, "step": 323200 }, { "epoch": 6.365300939142761, "grad_norm": 1.8056912422180176, "learning_rate": 6.177215953825792e-07, "loss": 1.9763, "num_input_tokens_seen": 329190040, "step": 323300 }, { "epoch": 6.367269791892264, "grad_norm": 1.8254534006118774, "learning_rate": 6.175212010179191e-07, "loss": 1.9809, "num_input_tokens_seen": 329291960, "step": 323400 }, { "epoch": 6.369238644641767, "grad_norm": 1.7076184749603271, "learning_rate": 6.173207866697699e-07, "loss": 2.0051, "num_input_tokens_seen": 329392688, "step": 323500 }, { "epoch": 6.3712074973912705, "grad_norm": 1.8180934190750122, "learning_rate": 6.171203523722106e-07, "loss": 2.0257, "num_input_tokens_seen": 329494464, "step": 323600 }, { "epoch": 6.373176350140773, "grad_norm": 1.7735439538955688, "learning_rate": 6.169198981593231e-07, "loss": 1.958, "num_input_tokens_seen": 329596000, "step": 323700 }, { "epoch": 6.375145202890276, "grad_norm": 1.832324743270874, "learning_rate": 6.167194240651931e-07, "loss": 2.0022, "num_input_tokens_seen": 329698400, "step": 323800 }, { "epoch": 6.377114055639779, "grad_norm": 1.8855369091033936, "learning_rate": 6.165189301239094e-07, "loss": 1.9617, "num_input_tokens_seen": 329800088, "step": 323900 }, { "epoch": 6.3790829083892815, "grad_norm": 2.000969409942627, "learning_rate": 6.163184163695646e-07, "loss": 1.9507, "num_input_tokens_seen": 329901672, "step": 324000 }, { "epoch": 6.381051761138784, "grad_norm": 1.8009623289108276, "learning_rate": 6.16117882836254e-07, "loss": 2.0282, "num_input_tokens_seen": 330003256, "step": 324100 }, { "epoch": 6.383020613888287, "grad_norm": 2.3676700592041016, "learning_rate": 6.159173295580769e-07, "loss": 1.9634, "num_input_tokens_seen": 330105136, "step": 324200 }, { "epoch": 6.38498946663779, "grad_norm": 2.029550552368164, "learning_rate": 6.157167565691356e-07, "loss": 1.9662, "num_input_tokens_seen": 330207536, "step": 324300 }, { "epoch": 6.386958319387293, "grad_norm": 2.4419150352478027, "learning_rate": 6.155161639035359e-07, "loss": 2.033, "num_input_tokens_seen": 330308104, "step": 324400 }, { "epoch": 6.388927172136796, "grad_norm": 1.8690277338027954, "learning_rate": 6.153155515953867e-07, "loss": 1.9661, "num_input_tokens_seen": 330410504, "step": 324500 }, { "epoch": 6.390896024886299, "grad_norm": 1.9267312288284302, "learning_rate": 6.151149196788007e-07, "loss": 1.9667, "num_input_tokens_seen": 330511512, "step": 324600 }, { "epoch": 6.392864877635802, "grad_norm": 2.4113211631774902, "learning_rate": 6.149142681878934e-07, "loss": 2.0224, "num_input_tokens_seen": 330613056, "step": 324700 }, { "epoch": 6.3948337303853044, "grad_norm": 1.7615582942962646, "learning_rate": 6.147135971567841e-07, "loss": 1.9618, "num_input_tokens_seen": 330715456, "step": 324800 }, { "epoch": 6.396802583134807, "grad_norm": 1.7147644758224487, "learning_rate": 6.145129066195951e-07, "loss": 1.9512, "num_input_tokens_seen": 330817856, "step": 324900 }, { "epoch": 6.39877143588431, "grad_norm": 2.0383174419403076, "learning_rate": 6.143121966104522e-07, "loss": 1.9715, "num_input_tokens_seen": 330919608, "step": 325000 }, { "epoch": 6.400740288633813, "grad_norm": 2.049348831176758, "learning_rate": 6.141114671634844e-07, "loss": 1.9778, "num_input_tokens_seen": 331022008, "step": 325100 }, { "epoch": 6.4027091413833155, "grad_norm": 1.7986857891082764, "learning_rate": 6.139107183128241e-07, "loss": 1.9972, "num_input_tokens_seen": 331122512, "step": 325200 }, { "epoch": 6.404677994132819, "grad_norm": 2.0284037590026855, "learning_rate": 6.13709950092607e-07, "loss": 1.9836, "num_input_tokens_seen": 331224008, "step": 325300 }, { "epoch": 6.406646846882322, "grad_norm": 2.012815475463867, "learning_rate": 6.135091625369718e-07, "loss": 1.989, "num_input_tokens_seen": 331325904, "step": 325400 }, { "epoch": 6.408615699631825, "grad_norm": 1.916353464126587, "learning_rate": 6.133083556800609e-07, "loss": 1.9713, "num_input_tokens_seen": 331427776, "step": 325500 }, { "epoch": 6.410584552381327, "grad_norm": 2.093794345855713, "learning_rate": 6.1310752955602e-07, "loss": 2.0217, "num_input_tokens_seen": 331530176, "step": 325600 }, { "epoch": 6.41255340513083, "grad_norm": 1.9516007900238037, "learning_rate": 6.129066841989974e-07, "loss": 1.9553, "num_input_tokens_seen": 331631640, "step": 325700 }, { "epoch": 6.414522257880333, "grad_norm": 1.891526222229004, "learning_rate": 6.127058196431457e-07, "loss": 1.9457, "num_input_tokens_seen": 331734040, "step": 325800 }, { "epoch": 6.416491110629836, "grad_norm": 2.3124842643737793, "learning_rate": 6.125049359226199e-07, "loss": 1.9858, "num_input_tokens_seen": 331835920, "step": 325900 }, { "epoch": 6.418459963379339, "grad_norm": 1.8841344118118286, "learning_rate": 6.123040330715785e-07, "loss": 1.9908, "num_input_tokens_seen": 331938320, "step": 326000 }, { "epoch": 6.420428816128842, "grad_norm": 1.7686660289764404, "learning_rate": 6.121031111241837e-07, "loss": 1.9863, "num_input_tokens_seen": 332040720, "step": 326100 }, { "epoch": 6.422397668878345, "grad_norm": 1.8571674823760986, "learning_rate": 6.119021701146003e-07, "loss": 1.9864, "num_input_tokens_seen": 332142592, "step": 326200 }, { "epoch": 6.424366521627848, "grad_norm": 1.7800183296203613, "learning_rate": 6.117012100769967e-07, "loss": 1.9638, "num_input_tokens_seen": 332244992, "step": 326300 }, { "epoch": 6.42633537437735, "grad_norm": 2.127934455871582, "learning_rate": 6.115002310455445e-07, "loss": 1.9432, "num_input_tokens_seen": 332346752, "step": 326400 }, { "epoch": 6.428304227126853, "grad_norm": 1.9178853034973145, "learning_rate": 6.112992330544184e-07, "loss": 1.9676, "num_input_tokens_seen": 332448616, "step": 326500 }, { "epoch": 6.430273079876356, "grad_norm": 2.1226372718811035, "learning_rate": 6.110982161377964e-07, "loss": 2.0212, "num_input_tokens_seen": 332551016, "step": 326600 }, { "epoch": 6.432241932625859, "grad_norm": 1.699806571006775, "learning_rate": 6.1089718032986e-07, "loss": 1.996, "num_input_tokens_seen": 332653416, "step": 326700 }, { "epoch": 6.434210785375361, "grad_norm": 1.8614561557769775, "learning_rate": 6.106961256647932e-07, "loss": 1.9219, "num_input_tokens_seen": 332755064, "step": 326800 }, { "epoch": 6.436179638124865, "grad_norm": 1.6365993022918701, "learning_rate": 6.10495052176784e-07, "loss": 2.0384, "num_input_tokens_seen": 332855528, "step": 326900 }, { "epoch": 6.438148490874368, "grad_norm": 1.9996424913406372, "learning_rate": 6.102939599000231e-07, "loss": 1.9601, "num_input_tokens_seen": 332957248, "step": 327000 }, { "epoch": 6.440117343623871, "grad_norm": 1.8351149559020996, "learning_rate": 6.100928488687046e-07, "loss": 1.9764, "num_input_tokens_seen": 333059648, "step": 327100 }, { "epoch": 6.442086196373373, "grad_norm": 2.0287539958953857, "learning_rate": 6.098917191170257e-07, "loss": 2.0064, "num_input_tokens_seen": 333161352, "step": 327200 }, { "epoch": 6.444055049122876, "grad_norm": 2.030189037322998, "learning_rate": 6.096905706791869e-07, "loss": 1.9555, "num_input_tokens_seen": 333263152, "step": 327300 }, { "epoch": 6.446023901872379, "grad_norm": 2.1728546619415283, "learning_rate": 6.094894035893916e-07, "loss": 1.946, "num_input_tokens_seen": 333365552, "step": 327400 }, { "epoch": 6.447992754621882, "grad_norm": 1.9530166387557983, "learning_rate": 6.092882178818468e-07, "loss": 1.9646, "num_input_tokens_seen": 333467288, "step": 327500 }, { "epoch": 6.449961607371384, "grad_norm": 1.7757498025894165, "learning_rate": 6.090870135907623e-07, "loss": 2.0014, "num_input_tokens_seen": 333569056, "step": 327600 }, { "epoch": 6.451930460120888, "grad_norm": 1.9455411434173584, "learning_rate": 6.088857907503513e-07, "loss": 1.9777, "num_input_tokens_seen": 333670616, "step": 327700 }, { "epoch": 6.453899312870391, "grad_norm": 1.9355013370513916, "learning_rate": 6.086845493948299e-07, "loss": 1.9434, "num_input_tokens_seen": 333773016, "step": 327800 }, { "epoch": 6.4558681656198935, "grad_norm": 2.1994781494140625, "learning_rate": 6.084832895584174e-07, "loss": 1.9661, "num_input_tokens_seen": 333875416, "step": 327900 }, { "epoch": 6.457837018369396, "grad_norm": 2.1956112384796143, "learning_rate": 6.082820112753367e-07, "loss": 2.0031, "num_input_tokens_seen": 333976984, "step": 328000 }, { "epoch": 6.459805871118899, "grad_norm": 1.8465567827224731, "learning_rate": 6.080807145798133e-07, "loss": 2.0056, "num_input_tokens_seen": 334078616, "step": 328100 }, { "epoch": 6.461774723868402, "grad_norm": 1.9540822505950928, "learning_rate": 6.078793995060758e-07, "loss": 2.0113, "num_input_tokens_seen": 334178672, "step": 328200 }, { "epoch": 6.4637435766179046, "grad_norm": 1.91953706741333, "learning_rate": 6.076780660883565e-07, "loss": 1.9309, "num_input_tokens_seen": 334280160, "step": 328300 }, { "epoch": 6.465712429367407, "grad_norm": 2.0835020542144775, "learning_rate": 6.074767143608901e-07, "loss": 1.9724, "num_input_tokens_seen": 334382128, "step": 328400 }, { "epoch": 6.46768128211691, "grad_norm": 1.8535174131393433, "learning_rate": 6.072753443579151e-07, "loss": 2.0113, "num_input_tokens_seen": 334483960, "step": 328500 }, { "epoch": 6.469650134866414, "grad_norm": 1.879845380783081, "learning_rate": 6.070739561136724e-07, "loss": 2.0152, "num_input_tokens_seen": 334585072, "step": 328600 }, { "epoch": 6.4716189876159165, "grad_norm": 2.101710081100464, "learning_rate": 6.068725496624068e-07, "loss": 1.9484, "num_input_tokens_seen": 334687472, "step": 328700 }, { "epoch": 6.473587840365419, "grad_norm": 2.185837507247925, "learning_rate": 6.066711250383653e-07, "loss": 1.9737, "num_input_tokens_seen": 334789160, "step": 328800 }, { "epoch": 6.475556693114922, "grad_norm": 1.8057156801223755, "learning_rate": 6.064696822757988e-07, "loss": 1.9503, "num_input_tokens_seen": 334891560, "step": 328900 }, { "epoch": 6.477525545864425, "grad_norm": 1.943167805671692, "learning_rate": 6.062682214089608e-07, "loss": 1.97, "num_input_tokens_seen": 334993960, "step": 329000 }, { "epoch": 6.4794943986139275, "grad_norm": 1.9640806913375854, "learning_rate": 6.060667424721081e-07, "loss": 2.0122, "num_input_tokens_seen": 335096360, "step": 329100 }, { "epoch": 6.48146325136343, "grad_norm": 2.066638708114624, "learning_rate": 6.058652454995004e-07, "loss": 1.9992, "num_input_tokens_seen": 335198072, "step": 329200 }, { "epoch": 6.483432104112933, "grad_norm": 2.1574697494506836, "learning_rate": 6.056637305254007e-07, "loss": 1.9994, "num_input_tokens_seen": 335299576, "step": 329300 }, { "epoch": 6.485400956862437, "grad_norm": 2.0422778129577637, "learning_rate": 6.054621975840748e-07, "loss": 1.9752, "num_input_tokens_seen": 335401360, "step": 329400 }, { "epoch": 6.487369809611939, "grad_norm": 2.1192007064819336, "learning_rate": 6.052606467097917e-07, "loss": 2.0178, "num_input_tokens_seen": 335503760, "step": 329500 }, { "epoch": 6.489338662361442, "grad_norm": 1.8772376775741577, "learning_rate": 6.050590779368234e-07, "loss": 1.967, "num_input_tokens_seen": 335605520, "step": 329600 }, { "epoch": 6.491307515110945, "grad_norm": 1.9499151706695557, "learning_rate": 6.048574912994451e-07, "loss": 1.9705, "num_input_tokens_seen": 335707920, "step": 329700 }, { "epoch": 6.493276367860448, "grad_norm": 2.202011823654175, "learning_rate": 6.046558868319347e-07, "loss": 1.9384, "num_input_tokens_seen": 335810320, "step": 329800 }, { "epoch": 6.4952452206099505, "grad_norm": 1.7260181903839111, "learning_rate": 6.044542645685735e-07, "loss": 1.9509, "num_input_tokens_seen": 335912720, "step": 329900 }, { "epoch": 6.497214073359453, "grad_norm": 1.5298411846160889, "learning_rate": 6.042526245436455e-07, "loss": 1.9802, "num_input_tokens_seen": 336014264, "step": 330000 }, { "epoch": 6.499182926108956, "grad_norm": 2.025395631790161, "learning_rate": 6.040509667914382e-07, "loss": 2.009, "num_input_tokens_seen": 336116576, "step": 330100 }, { "epoch": 6.501151778858459, "grad_norm": 2.1979007720947266, "learning_rate": 6.038492913462417e-07, "loss": 1.9367, "num_input_tokens_seen": 336218976, "step": 330200 }, { "epoch": 6.503120631607962, "grad_norm": 2.0642919540405273, "learning_rate": 6.036475982423488e-07, "loss": 1.9565, "num_input_tokens_seen": 336319424, "step": 330300 }, { "epoch": 6.505089484357465, "grad_norm": 1.9608566761016846, "learning_rate": 6.034458875140563e-07, "loss": 1.9525, "num_input_tokens_seen": 336420968, "step": 330400 }, { "epoch": 6.507058337106968, "grad_norm": 1.8452330827713013, "learning_rate": 6.032441591956628e-07, "loss": 1.9453, "num_input_tokens_seen": 336523368, "step": 330500 }, { "epoch": 6.509027189856471, "grad_norm": 2.0573019981384277, "learning_rate": 6.030424133214711e-07, "loss": 1.9638, "num_input_tokens_seen": 336624960, "step": 330600 }, { "epoch": 6.510996042605973, "grad_norm": 1.7164239883422852, "learning_rate": 6.02840649925786e-07, "loss": 1.9829, "num_input_tokens_seen": 336727360, "step": 330700 }, { "epoch": 6.512964895355476, "grad_norm": 1.9546500444412231, "learning_rate": 6.026388690429158e-07, "loss": 2.0137, "num_input_tokens_seen": 336828952, "step": 330800 }, { "epoch": 6.514933748104979, "grad_norm": 1.9857194423675537, "learning_rate": 6.024370707071717e-07, "loss": 1.948, "num_input_tokens_seen": 336930544, "step": 330900 }, { "epoch": 6.516902600854483, "grad_norm": 2.175293207168579, "learning_rate": 6.022352549528675e-07, "loss": 1.9875, "num_input_tokens_seen": 337032312, "step": 331000 }, { "epoch": 6.518871453603985, "grad_norm": 2.0272483825683594, "learning_rate": 6.020334218143206e-07, "loss": 2.0037, "num_input_tokens_seen": 337133152, "step": 331100 }, { "epoch": 6.520840306353488, "grad_norm": 1.8052414655685425, "learning_rate": 6.01831571325851e-07, "loss": 1.9783, "num_input_tokens_seen": 337234616, "step": 331200 }, { "epoch": 6.522809159102991, "grad_norm": 2.2724173069000244, "learning_rate": 6.016297035217816e-07, "loss": 1.9855, "num_input_tokens_seen": 337337016, "step": 331300 }, { "epoch": 6.524778011852494, "grad_norm": 1.8018869161605835, "learning_rate": 6.014278184364382e-07, "loss": 2.0007, "num_input_tokens_seen": 337439128, "step": 331400 }, { "epoch": 6.526746864601996, "grad_norm": 2.0168981552124023, "learning_rate": 6.012259161041497e-07, "loss": 1.9533, "num_input_tokens_seen": 337540664, "step": 331500 }, { "epoch": 6.528715717351499, "grad_norm": 1.9451590776443481, "learning_rate": 6.010239965592482e-07, "loss": 2.0135, "num_input_tokens_seen": 337641112, "step": 331600 }, { "epoch": 6.530684570101002, "grad_norm": 2.2448904514312744, "learning_rate": 6.008220598360679e-07, "loss": 1.9419, "num_input_tokens_seen": 337742960, "step": 331700 }, { "epoch": 6.532653422850505, "grad_norm": 1.8448795080184937, "learning_rate": 6.006201059689468e-07, "loss": 1.982, "num_input_tokens_seen": 337845360, "step": 331800 }, { "epoch": 6.534622275600007, "grad_norm": 1.9624658823013306, "learning_rate": 6.004181349922253e-07, "loss": 2.0188, "num_input_tokens_seen": 337947008, "step": 331900 }, { "epoch": 6.536591128349511, "grad_norm": 1.9757168292999268, "learning_rate": 6.002161469402469e-07, "loss": 1.9757, "num_input_tokens_seen": 338049408, "step": 332000 }, { "epoch": 6.538559981099014, "grad_norm": 1.7830297946929932, "learning_rate": 6.000141418473581e-07, "loss": 1.9633, "num_input_tokens_seen": 338150376, "step": 332100 }, { "epoch": 6.540528833848517, "grad_norm": 5.492066860198975, "learning_rate": 5.99812119747908e-07, "loss": 2.0123, "num_input_tokens_seen": 338251064, "step": 332200 }, { "epoch": 6.542497686598019, "grad_norm": 2.2998900413513184, "learning_rate": 5.996100806762487e-07, "loss": 2.0188, "num_input_tokens_seen": 338353016, "step": 332300 }, { "epoch": 6.544466539347522, "grad_norm": 1.9289929866790771, "learning_rate": 5.994080246667354e-07, "loss": 1.9853, "num_input_tokens_seen": 338455416, "step": 332400 }, { "epoch": 6.546435392097025, "grad_norm": 1.9939790964126587, "learning_rate": 5.992059517537259e-07, "loss": 1.969, "num_input_tokens_seen": 338557264, "step": 332500 }, { "epoch": 6.548404244846528, "grad_norm": 3.035196542739868, "learning_rate": 5.990038619715811e-07, "loss": 2.0201, "num_input_tokens_seen": 338658312, "step": 332600 }, { "epoch": 6.550373097596031, "grad_norm": 2.042783498764038, "learning_rate": 5.988017553546646e-07, "loss": 2.0123, "num_input_tokens_seen": 338759352, "step": 332700 }, { "epoch": 6.552341950345534, "grad_norm": 2.005141258239746, "learning_rate": 5.98599631937343e-07, "loss": 2.0006, "num_input_tokens_seen": 338861536, "step": 332800 }, { "epoch": 6.554310803095037, "grad_norm": 1.9736496210098267, "learning_rate": 5.983974917539855e-07, "loss": 1.9879, "num_input_tokens_seen": 338963936, "step": 332900 }, { "epoch": 6.5562796558445395, "grad_norm": 1.995004415512085, "learning_rate": 5.981953348389645e-07, "loss": 2.0082, "num_input_tokens_seen": 339066336, "step": 333000 }, { "epoch": 6.558248508594042, "grad_norm": 2.825413227081299, "learning_rate": 5.97993161226655e-07, "loss": 1.9956, "num_input_tokens_seen": 339168080, "step": 333100 }, { "epoch": 6.560217361343545, "grad_norm": 10.762933731079102, "learning_rate": 5.977909709514349e-07, "loss": 1.9451, "num_input_tokens_seen": 339270480, "step": 333200 }, { "epoch": 6.562186214093048, "grad_norm": 2.0744829177856445, "learning_rate": 5.975887640476852e-07, "loss": 2.0153, "num_input_tokens_seen": 339370784, "step": 333300 }, { "epoch": 6.564155066842551, "grad_norm": 1.7472552061080933, "learning_rate": 5.97386540549789e-07, "loss": 1.9787, "num_input_tokens_seen": 339472264, "step": 333400 }, { "epoch": 6.566123919592053, "grad_norm": 1.8035584688186646, "learning_rate": 5.971843004921332e-07, "loss": 1.9873, "num_input_tokens_seen": 339573752, "step": 333500 }, { "epoch": 6.568092772341556, "grad_norm": 1.9936245679855347, "learning_rate": 5.969820439091066e-07, "loss": 2.0252, "num_input_tokens_seen": 339676152, "step": 333600 }, { "epoch": 6.57006162509106, "grad_norm": 2.026432991027832, "learning_rate": 5.967797708351016e-07, "loss": 1.9662, "num_input_tokens_seen": 339778552, "step": 333700 }, { "epoch": 6.5720304778405625, "grad_norm": 2.013417959213257, "learning_rate": 5.965774813045128e-07, "loss": 1.9947, "num_input_tokens_seen": 339880824, "step": 333800 }, { "epoch": 6.573999330590065, "grad_norm": 1.8835017681121826, "learning_rate": 5.963751753517378e-07, "loss": 1.9986, "num_input_tokens_seen": 339983224, "step": 333900 }, { "epoch": 6.575968183339568, "grad_norm": 2.0019731521606445, "learning_rate": 5.961728530111771e-07, "loss": 1.9697, "num_input_tokens_seen": 340084880, "step": 334000 }, { "epoch": 6.577937036089071, "grad_norm": 1.8406524658203125, "learning_rate": 5.95970514317234e-07, "loss": 2.0124, "num_input_tokens_seen": 340187280, "step": 334100 }, { "epoch": 6.5799058888385735, "grad_norm": 2.345460891723633, "learning_rate": 5.957681593043144e-07, "loss": 1.9799, "num_input_tokens_seen": 340288896, "step": 334200 }, { "epoch": 6.581874741588076, "grad_norm": 1.9273667335510254, "learning_rate": 5.95565788006827e-07, "loss": 1.9683, "num_input_tokens_seen": 340391296, "step": 334300 }, { "epoch": 6.58384359433758, "grad_norm": 1.9310986995697021, "learning_rate": 5.953634004591833e-07, "loss": 1.9453, "num_input_tokens_seen": 340491568, "step": 334400 }, { "epoch": 6.585812447087083, "grad_norm": 1.782335638999939, "learning_rate": 5.951609966957978e-07, "loss": 1.9641, "num_input_tokens_seen": 340592192, "step": 334500 }, { "epoch": 6.5877812998365854, "grad_norm": 1.9793280363082886, "learning_rate": 5.949585767510874e-07, "loss": 1.9991, "num_input_tokens_seen": 340694592, "step": 334600 }, { "epoch": 6.589750152586088, "grad_norm": 1.798757791519165, "learning_rate": 5.94756140659472e-07, "loss": 1.9617, "num_input_tokens_seen": 340796992, "step": 334700 }, { "epoch": 6.591719005335591, "grad_norm": 1.9851182699203491, "learning_rate": 5.94553688455374e-07, "loss": 2.0403, "num_input_tokens_seen": 340899008, "step": 334800 }, { "epoch": 6.593687858085094, "grad_norm": 1.9218271970748901, "learning_rate": 5.943512201732189e-07, "loss": 2.0017, "num_input_tokens_seen": 341001408, "step": 334900 }, { "epoch": 6.5956567108345965, "grad_norm": 1.775185465812683, "learning_rate": 5.941487358474344e-07, "loss": 1.9468, "num_input_tokens_seen": 341102960, "step": 335000 }, { "epoch": 6.597625563584099, "grad_norm": 2.445472002029419, "learning_rate": 5.939462355124516e-07, "loss": 2.0083, "num_input_tokens_seen": 341204744, "step": 335100 }, { "epoch": 6.599594416333602, "grad_norm": 4.558581829071045, "learning_rate": 5.937437192027037e-07, "loss": 2.0086, "num_input_tokens_seen": 341305304, "step": 335200 }, { "epoch": 6.601563269083106, "grad_norm": 2.0440187454223633, "learning_rate": 5.935411869526273e-07, "loss": 1.9781, "num_input_tokens_seen": 341407704, "step": 335300 }, { "epoch": 6.603532121832608, "grad_norm": 1.8654133081436157, "learning_rate": 5.933386387966608e-07, "loss": 1.9822, "num_input_tokens_seen": 341510104, "step": 335400 }, { "epoch": 6.605500974582111, "grad_norm": 1.9177144765853882, "learning_rate": 5.931360747692461e-07, "loss": 2.0323, "num_input_tokens_seen": 341612504, "step": 335500 }, { "epoch": 6.607469827331614, "grad_norm": 1.9475739002227783, "learning_rate": 5.929334949048274e-07, "loss": 1.9899, "num_input_tokens_seen": 341714080, "step": 335600 }, { "epoch": 6.609438680081117, "grad_norm": 1.7756541967391968, "learning_rate": 5.927308992378518e-07, "loss": 2.0027, "num_input_tokens_seen": 341816480, "step": 335700 }, { "epoch": 6.611407532830619, "grad_norm": 2.0020506381988525, "learning_rate": 5.925282878027688e-07, "loss": 2.004, "num_input_tokens_seen": 341918184, "step": 335800 }, { "epoch": 6.613376385580122, "grad_norm": 1.7956039905548096, "learning_rate": 5.92325660634031e-07, "loss": 2.0263, "num_input_tokens_seen": 342019904, "step": 335900 }, { "epoch": 6.615345238329625, "grad_norm": 2.057025194168091, "learning_rate": 5.921230177660932e-07, "loss": 2.0009, "num_input_tokens_seen": 342122304, "step": 336000 }, { "epoch": 6.617314091079129, "grad_norm": 1.800400733947754, "learning_rate": 5.919203592334132e-07, "loss": 2.002, "num_input_tokens_seen": 342224272, "step": 336100 }, { "epoch": 6.619282943828631, "grad_norm": 2.387866973876953, "learning_rate": 5.917176850704515e-07, "loss": 1.9729, "num_input_tokens_seen": 342326672, "step": 336200 }, { "epoch": 6.621251796578134, "grad_norm": 1.7869130373001099, "learning_rate": 5.91514995311671e-07, "loss": 2.0465, "num_input_tokens_seen": 342426264, "step": 336300 }, { "epoch": 6.623220649327637, "grad_norm": 2.065978527069092, "learning_rate": 5.913122899915375e-07, "loss": 1.9889, "num_input_tokens_seen": 342528032, "step": 336400 }, { "epoch": 6.62518950207714, "grad_norm": 1.9413907527923584, "learning_rate": 5.911095691445192e-07, "loss": 1.9909, "num_input_tokens_seen": 342629840, "step": 336500 }, { "epoch": 6.627158354826642, "grad_norm": 1.8402358293533325, "learning_rate": 5.909068328050871e-07, "loss": 1.9538, "num_input_tokens_seen": 342732240, "step": 336600 }, { "epoch": 6.629127207576145, "grad_norm": 1.9476884603500366, "learning_rate": 5.907040810077149e-07, "loss": 1.9459, "num_input_tokens_seen": 342834640, "step": 336700 }, { "epoch": 6.631096060325648, "grad_norm": 1.9057573080062866, "learning_rate": 5.905013137868786e-07, "loss": 1.948, "num_input_tokens_seen": 342937040, "step": 336800 }, { "epoch": 6.633064913075151, "grad_norm": 1.87544584274292, "learning_rate": 5.902985311770574e-07, "loss": 1.994, "num_input_tokens_seen": 343038568, "step": 336900 }, { "epoch": 6.635033765824654, "grad_norm": 1.833757758140564, "learning_rate": 5.900957332127324e-07, "loss": 1.9739, "num_input_tokens_seen": 343140968, "step": 337000 }, { "epoch": 6.637002618574157, "grad_norm": 1.8252379894256592, "learning_rate": 5.898929199283879e-07, "loss": 2.0175, "num_input_tokens_seen": 343241880, "step": 337100 }, { "epoch": 6.63897147132366, "grad_norm": 1.8570109605789185, "learning_rate": 5.896900913585108e-07, "loss": 1.9806, "num_input_tokens_seen": 343344232, "step": 337200 }, { "epoch": 6.640940324073163, "grad_norm": 1.637611985206604, "learning_rate": 5.894872475375898e-07, "loss": 1.9897, "num_input_tokens_seen": 343445784, "step": 337300 }, { "epoch": 6.642909176822665, "grad_norm": 1.9490511417388916, "learning_rate": 5.892843885001174e-07, "loss": 1.9749, "num_input_tokens_seen": 343548184, "step": 337400 }, { "epoch": 6.644878029572168, "grad_norm": 2.083259344100952, "learning_rate": 5.890815142805877e-07, "loss": 2.027, "num_input_tokens_seen": 343650584, "step": 337500 }, { "epoch": 6.646846882321671, "grad_norm": 1.9399176836013794, "learning_rate": 5.88878624913498e-07, "loss": 1.9785, "num_input_tokens_seen": 343752984, "step": 337600 }, { "epoch": 6.6488157350711745, "grad_norm": 2.3614978790283203, "learning_rate": 5.886757204333474e-07, "loss": 1.9714, "num_input_tokens_seen": 343853312, "step": 337700 }, { "epoch": 6.650784587820677, "grad_norm": 2.3670032024383545, "learning_rate": 5.884728008746388e-07, "loss": 1.9606, "num_input_tokens_seen": 343955088, "step": 337800 }, { "epoch": 6.65275344057018, "grad_norm": 1.8101786375045776, "learning_rate": 5.882698662718765e-07, "loss": 1.9502, "num_input_tokens_seen": 344057488, "step": 337900 }, { "epoch": 6.654722293319683, "grad_norm": 1.8339052200317383, "learning_rate": 5.880669166595679e-07, "loss": 1.9567, "num_input_tokens_seen": 344159272, "step": 338000 }, { "epoch": 6.6566911460691855, "grad_norm": 1.7340331077575684, "learning_rate": 5.878639520722228e-07, "loss": 1.9406, "num_input_tokens_seen": 344261216, "step": 338100 }, { "epoch": 6.658659998818688, "grad_norm": 1.6825907230377197, "learning_rate": 5.87660972544354e-07, "loss": 1.9812, "num_input_tokens_seen": 344363616, "step": 338200 }, { "epoch": 6.660628851568191, "grad_norm": 1.6769567728042603, "learning_rate": 5.874579781104761e-07, "loss": 1.9939, "num_input_tokens_seen": 344466016, "step": 338300 }, { "epoch": 6.662597704317694, "grad_norm": 1.7573374509811401, "learning_rate": 5.872549688051066e-07, "loss": 1.9588, "num_input_tokens_seen": 344567736, "step": 338400 }, { "epoch": 6.664566557067197, "grad_norm": 1.8433133363723755, "learning_rate": 5.870519446627656e-07, "loss": 2.0107, "num_input_tokens_seen": 344666912, "step": 338500 }, { "epoch": 6.666535409816699, "grad_norm": 1.5975202322006226, "learning_rate": 5.868489057179757e-07, "loss": 1.9665, "num_input_tokens_seen": 344768880, "step": 338600 }, { "epoch": 6.668504262566203, "grad_norm": 1.6781902313232422, "learning_rate": 5.86645852005262e-07, "loss": 1.9924, "num_input_tokens_seen": 344870384, "step": 338700 }, { "epoch": 6.670473115315706, "grad_norm": 1.949173092842102, "learning_rate": 5.864427835591518e-07, "loss": 1.9494, "num_input_tokens_seen": 344971976, "step": 338800 }, { "epoch": 6.6724419680652085, "grad_norm": 1.9719985723495483, "learning_rate": 5.862397004141754e-07, "loss": 2.0234, "num_input_tokens_seen": 345072800, "step": 338900 }, { "epoch": 6.674410820814711, "grad_norm": 1.8209240436553955, "learning_rate": 5.860366026048654e-07, "loss": 2.0197, "num_input_tokens_seen": 345173656, "step": 339000 }, { "epoch": 6.676379673564214, "grad_norm": 2.11574649810791, "learning_rate": 5.858334901657565e-07, "loss": 1.9947, "num_input_tokens_seen": 345275320, "step": 339100 }, { "epoch": 6.678348526313717, "grad_norm": 1.870963215827942, "learning_rate": 5.85630363131387e-07, "loss": 2.0067, "num_input_tokens_seen": 345376992, "step": 339200 }, { "epoch": 6.6803173790632195, "grad_norm": 1.9651352167129517, "learning_rate": 5.85427221536296e-07, "loss": 2.0146, "num_input_tokens_seen": 345479392, "step": 339300 }, { "epoch": 6.682286231812723, "grad_norm": 2.019988536834717, "learning_rate": 5.852240654150268e-07, "loss": 2.0072, "num_input_tokens_seen": 345581216, "step": 339400 }, { "epoch": 6.684255084562226, "grad_norm": 1.876074194908142, "learning_rate": 5.850208948021241e-07, "loss": 2.0149, "num_input_tokens_seen": 345683072, "step": 339500 }, { "epoch": 6.686223937311729, "grad_norm": 1.852657437324524, "learning_rate": 5.848177097321354e-07, "loss": 1.9422, "num_input_tokens_seen": 345785472, "step": 339600 }, { "epoch": 6.6881927900612315, "grad_norm": 1.9414300918579102, "learning_rate": 5.846145102396104e-07, "loss": 1.916, "num_input_tokens_seen": 345887872, "step": 339700 }, { "epoch": 6.690161642810734, "grad_norm": 1.7268099784851074, "learning_rate": 5.844112963591017e-07, "loss": 1.9643, "num_input_tokens_seen": 345990272, "step": 339800 }, { "epoch": 6.692130495560237, "grad_norm": 1.8921703100204468, "learning_rate": 5.842080681251638e-07, "loss": 2.0061, "num_input_tokens_seen": 346091328, "step": 339900 }, { "epoch": 6.69409934830974, "grad_norm": 2.14228892326355, "learning_rate": 5.840048255723544e-07, "loss": 2.013, "num_input_tokens_seen": 346192800, "step": 340000 }, { "epoch": 6.6960682010592425, "grad_norm": 1.9993531703948975, "learning_rate": 5.838015687352327e-07, "loss": 1.9896, "num_input_tokens_seen": 346294408, "step": 340100 }, { "epoch": 6.698037053808745, "grad_norm": 1.7952618598937988, "learning_rate": 5.835982976483613e-07, "loss": 1.9382, "num_input_tokens_seen": 346396808, "step": 340200 }, { "epoch": 6.700005906558248, "grad_norm": 1.634080410003662, "learning_rate": 5.833950123463043e-07, "loss": 1.9993, "num_input_tokens_seen": 346497864, "step": 340300 }, { "epoch": 6.701974759307752, "grad_norm": 2.08182692527771, "learning_rate": 5.83191712863629e-07, "loss": 1.961, "num_input_tokens_seen": 346600264, "step": 340400 }, { "epoch": 6.703943612057254, "grad_norm": 2.3181722164154053, "learning_rate": 5.829883992349045e-07, "loss": 1.985, "num_input_tokens_seen": 346701840, "step": 340500 }, { "epoch": 6.705912464806757, "grad_norm": 2.2417986392974854, "learning_rate": 5.827850714947028e-07, "loss": 1.9871, "num_input_tokens_seen": 346803280, "step": 340600 }, { "epoch": 6.70788131755626, "grad_norm": 2.027480363845825, "learning_rate": 5.825817296775978e-07, "loss": 1.9796, "num_input_tokens_seen": 346905576, "step": 340700 }, { "epoch": 6.709850170305763, "grad_norm": 1.8036468029022217, "learning_rate": 5.823783738181662e-07, "loss": 1.9752, "num_input_tokens_seen": 347007240, "step": 340800 }, { "epoch": 6.7118190230552655, "grad_norm": 2.3290023803710938, "learning_rate": 5.821750039509869e-07, "loss": 1.9913, "num_input_tokens_seen": 347109120, "step": 340900 }, { "epoch": 6.713787875804768, "grad_norm": 1.868285059928894, "learning_rate": 5.819716201106415e-07, "loss": 1.9771, "num_input_tokens_seen": 347211152, "step": 341000 }, { "epoch": 6.715756728554272, "grad_norm": 1.776397705078125, "learning_rate": 5.817682223317133e-07, "loss": 1.9602, "num_input_tokens_seen": 347313448, "step": 341100 }, { "epoch": 6.717725581303775, "grad_norm": 1.8410203456878662, "learning_rate": 5.815648106487885e-07, "loss": 2.0098, "num_input_tokens_seen": 347414544, "step": 341200 }, { "epoch": 6.719694434053277, "grad_norm": 1.9899818897247314, "learning_rate": 5.813613850964557e-07, "loss": 2.0203, "num_input_tokens_seen": 347515848, "step": 341300 }, { "epoch": 6.72166328680278, "grad_norm": 1.7764811515808105, "learning_rate": 5.811579457093056e-07, "loss": 2.0277, "num_input_tokens_seen": 347618248, "step": 341400 }, { "epoch": 6.723632139552283, "grad_norm": 1.8589383363723755, "learning_rate": 5.809544925219313e-07, "loss": 1.9778, "num_input_tokens_seen": 347720104, "step": 341500 }, { "epoch": 6.725600992301786, "grad_norm": 2.182234287261963, "learning_rate": 5.807510255689283e-07, "loss": 1.994, "num_input_tokens_seen": 347822504, "step": 341600 }, { "epoch": 6.727569845051288, "grad_norm": 1.8717601299285889, "learning_rate": 5.805475448848947e-07, "loss": 2.0046, "num_input_tokens_seen": 347924560, "step": 341700 }, { "epoch": 6.729538697800791, "grad_norm": 1.7180418968200684, "learning_rate": 5.803440505044302e-07, "loss": 1.9474, "num_input_tokens_seen": 348025520, "step": 341800 }, { "epoch": 6.731507550550294, "grad_norm": 1.8298027515411377, "learning_rate": 5.801405424621379e-07, "loss": 2.0128, "num_input_tokens_seen": 348126256, "step": 341900 }, { "epoch": 6.733476403299798, "grad_norm": 2.12819504737854, "learning_rate": 5.79937020792622e-07, "loss": 1.9743, "num_input_tokens_seen": 348228104, "step": 342000 }, { "epoch": 6.7354452560493, "grad_norm": 2.0564088821411133, "learning_rate": 5.797334855304898e-07, "loss": 1.9498, "num_input_tokens_seen": 348330504, "step": 342100 }, { "epoch": 6.737414108798803, "grad_norm": 2.293470859527588, "learning_rate": 5.795299367103511e-07, "loss": 1.9674, "num_input_tokens_seen": 348432904, "step": 342200 }, { "epoch": 6.739382961548306, "grad_norm": 1.7751963138580322, "learning_rate": 5.793263743668175e-07, "loss": 1.9997, "num_input_tokens_seen": 348535304, "step": 342300 }, { "epoch": 6.741351814297809, "grad_norm": 1.7971038818359375, "learning_rate": 5.79122798534503e-07, "loss": 1.9741, "num_input_tokens_seen": 348636496, "step": 342400 }, { "epoch": 6.743320667047311, "grad_norm": 1.8907898664474487, "learning_rate": 5.789192092480239e-07, "loss": 2.0057, "num_input_tokens_seen": 348736832, "step": 342500 }, { "epoch": 6.745289519796814, "grad_norm": 3.2937254905700684, "learning_rate": 5.787156065419989e-07, "loss": 1.9634, "num_input_tokens_seen": 348837472, "step": 342600 }, { "epoch": 6.747258372546317, "grad_norm": 1.9441038370132446, "learning_rate": 5.78511990451049e-07, "loss": 1.9541, "num_input_tokens_seen": 348939224, "step": 342700 }, { "epoch": 6.7492272252958205, "grad_norm": 1.8163578510284424, "learning_rate": 5.783083610097973e-07, "loss": 2.0377, "num_input_tokens_seen": 349040752, "step": 342800 }, { "epoch": 6.751196078045323, "grad_norm": 1.8017921447753906, "learning_rate": 5.781047182528693e-07, "loss": 1.9621, "num_input_tokens_seen": 349142576, "step": 342900 }, { "epoch": 6.753164930794826, "grad_norm": 2.1576220989227295, "learning_rate": 5.779010622148926e-07, "loss": 2.0212, "num_input_tokens_seen": 349244976, "step": 343000 }, { "epoch": 6.755133783544329, "grad_norm": 2.0112767219543457, "learning_rate": 5.776973929304976e-07, "loss": 1.9123, "num_input_tokens_seen": 349346712, "step": 343100 }, { "epoch": 6.757102636293832, "grad_norm": 1.8547894954681396, "learning_rate": 5.774937104343159e-07, "loss": 1.994, "num_input_tokens_seen": 349447256, "step": 343200 }, { "epoch": 6.759071489043334, "grad_norm": 2.012669324874878, "learning_rate": 5.772900147609825e-07, "loss": 2.0143, "num_input_tokens_seen": 349548592, "step": 343300 }, { "epoch": 6.761040341792837, "grad_norm": 1.7234598398208618, "learning_rate": 5.77086305945134e-07, "loss": 1.9973, "num_input_tokens_seen": 349650312, "step": 343400 }, { "epoch": 6.76300919454234, "grad_norm": 1.903673768043518, "learning_rate": 5.768825840214095e-07, "loss": 1.9502, "num_input_tokens_seen": 349752712, "step": 343500 }, { "epoch": 6.764978047291843, "grad_norm": 1.9555567502975464, "learning_rate": 5.766788490244498e-07, "loss": 2.0053, "num_input_tokens_seen": 349853808, "step": 343600 }, { "epoch": 6.766946900041346, "grad_norm": 2.0925943851470947, "learning_rate": 5.764751009888987e-07, "loss": 1.9851, "num_input_tokens_seen": 349955768, "step": 343700 }, { "epoch": 6.768915752790849, "grad_norm": 1.8598756790161133, "learning_rate": 5.762713399494015e-07, "loss": 1.9581, "num_input_tokens_seen": 350057696, "step": 343800 }, { "epoch": 6.770884605540352, "grad_norm": 2.1003823280334473, "learning_rate": 5.760675659406065e-07, "loss": 1.9688, "num_input_tokens_seen": 350160096, "step": 343900 }, { "epoch": 6.7728534582898545, "grad_norm": 1.919238805770874, "learning_rate": 5.758637789971635e-07, "loss": 2.0069, "num_input_tokens_seen": 350261160, "step": 344000 }, { "epoch": 6.774822311039357, "grad_norm": 1.9554997682571411, "learning_rate": 5.756599791537247e-07, "loss": 2.0035, "num_input_tokens_seen": 350361944, "step": 344100 }, { "epoch": 6.77679116378886, "grad_norm": 1.9567841291427612, "learning_rate": 5.754561664449444e-07, "loss": 1.9541, "num_input_tokens_seen": 350464344, "step": 344200 }, { "epoch": 6.778760016538363, "grad_norm": 1.673003077507019, "learning_rate": 5.752523409054796e-07, "loss": 2.0018, "num_input_tokens_seen": 350566144, "step": 344300 }, { "epoch": 6.7807288692878664, "grad_norm": 2.2396624088287354, "learning_rate": 5.750485025699889e-07, "loss": 2.0351, "num_input_tokens_seen": 350667904, "step": 344400 }, { "epoch": 6.782697722037369, "grad_norm": 2.2762765884399414, "learning_rate": 5.748446514731333e-07, "loss": 1.9764, "num_input_tokens_seen": 350769064, "step": 344500 }, { "epoch": 6.784666574786872, "grad_norm": 1.8437024354934692, "learning_rate": 5.74640787649576e-07, "loss": 1.9674, "num_input_tokens_seen": 350871184, "step": 344600 }, { "epoch": 6.786635427536375, "grad_norm": 1.8720802068710327, "learning_rate": 5.744369111339824e-07, "loss": 1.9963, "num_input_tokens_seen": 350973584, "step": 344700 }, { "epoch": 6.7886042802858775, "grad_norm": 3.721076726913452, "learning_rate": 5.742330219610197e-07, "loss": 2.0007, "num_input_tokens_seen": 351075984, "step": 344800 }, { "epoch": 6.79057313303538, "grad_norm": 2.1021668910980225, "learning_rate": 5.740291201653579e-07, "loss": 1.9819, "num_input_tokens_seen": 351178384, "step": 344900 }, { "epoch": 6.792541985784883, "grad_norm": 1.7631405591964722, "learning_rate": 5.738252057816684e-07, "loss": 1.9915, "num_input_tokens_seen": 351279120, "step": 345000 }, { "epoch": 6.794510838534386, "grad_norm": 1.955552339553833, "learning_rate": 5.736212788446257e-07, "loss": 1.9725, "num_input_tokens_seen": 351380592, "step": 345100 }, { "epoch": 6.7964796912838885, "grad_norm": 1.7008463144302368, "learning_rate": 5.734173393889051e-07, "loss": 2.03, "num_input_tokens_seen": 351482440, "step": 345200 }, { "epoch": 6.798448544033391, "grad_norm": 3.0899970531463623, "learning_rate": 5.732133874491853e-07, "loss": 2.0021, "num_input_tokens_seen": 351584840, "step": 345300 }, { "epoch": 6.800417396782895, "grad_norm": 1.8752034902572632, "learning_rate": 5.730094230601466e-07, "loss": 1.9792, "num_input_tokens_seen": 351687240, "step": 345400 }, { "epoch": 6.802386249532398, "grad_norm": 1.8636212348937988, "learning_rate": 5.728054462564711e-07, "loss": 1.9815, "num_input_tokens_seen": 351789640, "step": 345500 }, { "epoch": 6.8043551022819, "grad_norm": 1.7927242517471313, "learning_rate": 5.726014570728437e-07, "loss": 2.0082, "num_input_tokens_seen": 351892040, "step": 345600 }, { "epoch": 6.806323955031403, "grad_norm": 7.441971302032471, "learning_rate": 5.723974555439508e-07, "loss": 1.9791, "num_input_tokens_seen": 351993888, "step": 345700 }, { "epoch": 6.808292807780906, "grad_norm": 1.948264479637146, "learning_rate": 5.721934417044813e-07, "loss": 1.9926, "num_input_tokens_seen": 352096288, "step": 345800 }, { "epoch": 6.810261660530409, "grad_norm": 1.810975193977356, "learning_rate": 5.719894155891258e-07, "loss": 2.0123, "num_input_tokens_seen": 352198240, "step": 345900 }, { "epoch": 6.8122305132799115, "grad_norm": 1.8330130577087402, "learning_rate": 5.717853772325774e-07, "loss": 2.0488, "num_input_tokens_seen": 352300088, "step": 346000 }, { "epoch": 6.814199366029415, "grad_norm": 1.7867487668991089, "learning_rate": 5.71581326669531e-07, "loss": 2.0212, "num_input_tokens_seen": 352402488, "step": 346100 }, { "epoch": 6.816168218778918, "grad_norm": 1.878553867340088, "learning_rate": 5.713772639346839e-07, "loss": 2.006, "num_input_tokens_seen": 352503584, "step": 346200 }, { "epoch": 6.818137071528421, "grad_norm": 1.945835828781128, "learning_rate": 5.71173189062735e-07, "loss": 2.0214, "num_input_tokens_seen": 352605112, "step": 346300 }, { "epoch": 6.820105924277923, "grad_norm": 1.8979483842849731, "learning_rate": 5.709691020883856e-07, "loss": 1.9857, "num_input_tokens_seen": 352707512, "step": 346400 }, { "epoch": 6.822074777027426, "grad_norm": 1.9538261890411377, "learning_rate": 5.707650030463392e-07, "loss": 1.9477, "num_input_tokens_seen": 352809688, "step": 346500 }, { "epoch": 6.824043629776929, "grad_norm": 1.7969032526016235, "learning_rate": 5.705608919713006e-07, "loss": 1.9765, "num_input_tokens_seen": 352912088, "step": 346600 }, { "epoch": 6.826012482526432, "grad_norm": 3.273768186569214, "learning_rate": 5.703567688979776e-07, "loss": 1.9902, "num_input_tokens_seen": 353013616, "step": 346700 }, { "epoch": 6.827981335275934, "grad_norm": 2.1750895977020264, "learning_rate": 5.701526338610794e-07, "loss": 1.9447, "num_input_tokens_seen": 353116016, "step": 346800 }, { "epoch": 6.829950188025437, "grad_norm": 2.028327465057373, "learning_rate": 5.699484868953175e-07, "loss": 1.9477, "num_input_tokens_seen": 353218416, "step": 346900 }, { "epoch": 6.83191904077494, "grad_norm": 1.7221735715866089, "learning_rate": 5.697443280354055e-07, "loss": 1.9551, "num_input_tokens_seen": 353320312, "step": 347000 }, { "epoch": 6.833887893524444, "grad_norm": 2.1486196517944336, "learning_rate": 5.695401573160586e-07, "loss": 1.9697, "num_input_tokens_seen": 353422712, "step": 347100 }, { "epoch": 6.835856746273946, "grad_norm": 2.0063304901123047, "learning_rate": 5.693359747719946e-07, "loss": 1.9567, "num_input_tokens_seen": 353525112, "step": 347200 }, { "epoch": 6.837825599023449, "grad_norm": 2.1901392936706543, "learning_rate": 5.691317804379326e-07, "loss": 1.9624, "num_input_tokens_seen": 353627248, "step": 347300 }, { "epoch": 6.839794451772952, "grad_norm": 2.088301420211792, "learning_rate": 5.689275743485949e-07, "loss": 2.0254, "num_input_tokens_seen": 353729648, "step": 347400 }, { "epoch": 6.841763304522455, "grad_norm": 1.982779860496521, "learning_rate": 5.687233565387041e-07, "loss": 1.9797, "num_input_tokens_seen": 353831512, "step": 347500 }, { "epoch": 6.843732157271957, "grad_norm": 1.9566974639892578, "learning_rate": 5.685191270429866e-07, "loss": 2.0576, "num_input_tokens_seen": 353932304, "step": 347600 }, { "epoch": 6.84570101002146, "grad_norm": 2.1104142665863037, "learning_rate": 5.68314885896169e-07, "loss": 2.0094, "num_input_tokens_seen": 354033616, "step": 347700 }, { "epoch": 6.847669862770964, "grad_norm": 1.9614967107772827, "learning_rate": 5.681106331329817e-07, "loss": 2.01, "num_input_tokens_seen": 354136008, "step": 347800 }, { "epoch": 6.8496387155204665, "grad_norm": 2.188655138015747, "learning_rate": 5.679063687881554e-07, "loss": 1.9384, "num_input_tokens_seen": 354237600, "step": 347900 }, { "epoch": 6.851607568269969, "grad_norm": 2.3352487087249756, "learning_rate": 5.67702092896424e-07, "loss": 1.9875, "num_input_tokens_seen": 354338600, "step": 348000 }, { "epoch": 6.853576421019472, "grad_norm": 1.7377607822418213, "learning_rate": 5.674978054925226e-07, "loss": 2.0607, "num_input_tokens_seen": 354439248, "step": 348100 }, { "epoch": 6.855545273768975, "grad_norm": 1.6657297611236572, "learning_rate": 5.672935066111888e-07, "loss": 1.9402, "num_input_tokens_seen": 354540184, "step": 348200 }, { "epoch": 6.857514126518478, "grad_norm": 2.0258216857910156, "learning_rate": 5.670891962871616e-07, "loss": 1.9366, "num_input_tokens_seen": 354642584, "step": 348300 }, { "epoch": 6.85948297926798, "grad_norm": 1.7575098276138306, "learning_rate": 5.668848745551826e-07, "loss": 1.9895, "num_input_tokens_seen": 354744464, "step": 348400 }, { "epoch": 6.861451832017483, "grad_norm": 1.8955113887786865, "learning_rate": 5.666805414499947e-07, "loss": 2.0603, "num_input_tokens_seen": 354846864, "step": 348500 }, { "epoch": 6.863420684766986, "grad_norm": 2.3453078269958496, "learning_rate": 5.664761970063432e-07, "loss": 1.9852, "num_input_tokens_seen": 354948904, "step": 348600 }, { "epoch": 6.8653895375164895, "grad_norm": 1.8201344013214111, "learning_rate": 5.662718412589749e-07, "loss": 1.9727, "num_input_tokens_seen": 355051304, "step": 348700 }, { "epoch": 6.867358390265992, "grad_norm": 1.9629918336868286, "learning_rate": 5.660674742426393e-07, "loss": 1.9834, "num_input_tokens_seen": 355152888, "step": 348800 }, { "epoch": 6.869327243015495, "grad_norm": 1.9738911390304565, "learning_rate": 5.658630959920867e-07, "loss": 1.9556, "num_input_tokens_seen": 355255288, "step": 348900 }, { "epoch": 6.871296095764998, "grad_norm": 2.033052921295166, "learning_rate": 5.656587065420703e-07, "loss": 1.988, "num_input_tokens_seen": 355357136, "step": 349000 }, { "epoch": 6.8732649485145005, "grad_norm": 2.0622310638427734, "learning_rate": 5.654543059273446e-07, "loss": 1.9965, "num_input_tokens_seen": 355457976, "step": 349100 }, { "epoch": 6.875233801264003, "grad_norm": 1.981715202331543, "learning_rate": 5.652498941826662e-07, "loss": 1.9976, "num_input_tokens_seen": 355560376, "step": 349200 }, { "epoch": 6.877202654013506, "grad_norm": 1.9295616149902344, "learning_rate": 5.650454713427936e-07, "loss": 2.0092, "num_input_tokens_seen": 355662776, "step": 349300 }, { "epoch": 6.879171506763009, "grad_norm": 1.954331636428833, "learning_rate": 5.648410374424872e-07, "loss": 2.0167, "num_input_tokens_seen": 355764680, "step": 349400 }, { "epoch": 6.8811403595125125, "grad_norm": 3.1415114402770996, "learning_rate": 5.646365925165094e-07, "loss": 1.9845, "num_input_tokens_seen": 355866312, "step": 349500 }, { "epoch": 6.883109212262015, "grad_norm": 1.667959451675415, "learning_rate": 5.644321365996241e-07, "loss": 1.9968, "num_input_tokens_seen": 355966392, "step": 349600 }, { "epoch": 6.885078065011518, "grad_norm": 2.055408239364624, "learning_rate": 5.642276697265975e-07, "loss": 1.9822, "num_input_tokens_seen": 356067968, "step": 349700 }, { "epoch": 6.887046917761021, "grad_norm": 1.9663994312286377, "learning_rate": 5.640231919321974e-07, "loss": 1.9615, "num_input_tokens_seen": 356170368, "step": 349800 }, { "epoch": 6.8890157705105235, "grad_norm": 1.9548956155776978, "learning_rate": 5.638187032511935e-07, "loss": 1.9239, "num_input_tokens_seen": 356272768, "step": 349900 }, { "epoch": 6.890984623260026, "grad_norm": 2.039267063140869, "learning_rate": 5.636142037183574e-07, "loss": 1.9914, "num_input_tokens_seen": 356375168, "step": 350000 }, { "epoch": 6.892953476009529, "grad_norm": 1.8091191053390503, "learning_rate": 5.634096933684625e-07, "loss": 1.9485, "num_input_tokens_seen": 356477568, "step": 350100 }, { "epoch": 6.894922328759032, "grad_norm": 1.9435220956802368, "learning_rate": 5.632051722362838e-07, "loss": 1.9729, "num_input_tokens_seen": 356579120, "step": 350200 }, { "epoch": 6.8968911815085345, "grad_norm": 2.0470776557922363, "learning_rate": 5.630006403565989e-07, "loss": 1.9926, "num_input_tokens_seen": 356680896, "step": 350300 }, { "epoch": 6.898860034258038, "grad_norm": 1.8169724941253662, "learning_rate": 5.627960977641863e-07, "loss": 1.9603, "num_input_tokens_seen": 356783296, "step": 350400 }, { "epoch": 6.900828887007541, "grad_norm": 2.2178609371185303, "learning_rate": 5.625915444938271e-07, "loss": 2.0383, "num_input_tokens_seen": 356885128, "step": 350500 }, { "epoch": 6.902797739757044, "grad_norm": 1.917288899421692, "learning_rate": 5.623869805803038e-07, "loss": 2.0111, "num_input_tokens_seen": 356987392, "step": 350600 }, { "epoch": 6.9047665925065465, "grad_norm": 1.8889974355697632, "learning_rate": 5.621824060584005e-07, "loss": 2.0086, "num_input_tokens_seen": 357089792, "step": 350700 }, { "epoch": 6.906735445256049, "grad_norm": 1.655066728591919, "learning_rate": 5.619778209629034e-07, "loss": 1.9734, "num_input_tokens_seen": 357190952, "step": 350800 }, { "epoch": 6.908704298005552, "grad_norm": 1.8650329113006592, "learning_rate": 5.617732253286009e-07, "loss": 1.9931, "num_input_tokens_seen": 357293352, "step": 350900 }, { "epoch": 6.910673150755055, "grad_norm": 1.9264100790023804, "learning_rate": 5.615686191902822e-07, "loss": 1.954, "num_input_tokens_seen": 357395216, "step": 351000 }, { "epoch": 6.912642003504558, "grad_norm": 2.1717875003814697, "learning_rate": 5.613640025827392e-07, "loss": 2.0366, "num_input_tokens_seen": 357496904, "step": 351100 }, { "epoch": 6.914610856254061, "grad_norm": 1.8024847507476807, "learning_rate": 5.61159375540765e-07, "loss": 1.9804, "num_input_tokens_seen": 357598344, "step": 351200 }, { "epoch": 6.916579709003564, "grad_norm": 2.213294506072998, "learning_rate": 5.60954738099155e-07, "loss": 2.0139, "num_input_tokens_seen": 357700200, "step": 351300 }, { "epoch": 6.918548561753067, "grad_norm": 1.8208956718444824, "learning_rate": 5.607500902927057e-07, "loss": 2.0063, "num_input_tokens_seen": 357802600, "step": 351400 }, { "epoch": 6.920517414502569, "grad_norm": 1.8078587055206299, "learning_rate": 5.605454321562161e-07, "loss": 2.0143, "num_input_tokens_seen": 357905000, "step": 351500 }, { "epoch": 6.922486267252072, "grad_norm": 1.9979687929153442, "learning_rate": 5.603407637244864e-07, "loss": 1.9555, "num_input_tokens_seen": 358007400, "step": 351600 }, { "epoch": 6.924455120001575, "grad_norm": 1.9737186431884766, "learning_rate": 5.601360850323188e-07, "loss": 1.9822, "num_input_tokens_seen": 358109800, "step": 351700 }, { "epoch": 6.926423972751078, "grad_norm": 2.0724594593048096, "learning_rate": 5.59931396114517e-07, "loss": 1.9898, "num_input_tokens_seen": 358212200, "step": 351800 }, { "epoch": 6.92839282550058, "grad_norm": 2.0351979732513428, "learning_rate": 5.59726697005887e-07, "loss": 1.983, "num_input_tokens_seen": 358314600, "step": 351900 }, { "epoch": 6.930361678250083, "grad_norm": 1.9320485591888428, "learning_rate": 5.595219877412357e-07, "loss": 2.0139, "num_input_tokens_seen": 358415216, "step": 352000 }, { "epoch": 6.932330530999587, "grad_norm": 2.074334144592285, "learning_rate": 5.593172683553726e-07, "loss": 2.0202, "num_input_tokens_seen": 358517144, "step": 352100 }, { "epoch": 6.93429938374909, "grad_norm": 1.8210471868515015, "learning_rate": 5.591125388831083e-07, "loss": 1.9623, "num_input_tokens_seen": 358618752, "step": 352200 }, { "epoch": 6.936268236498592, "grad_norm": 1.9182939529418945, "learning_rate": 5.589077993592552e-07, "loss": 1.9568, "num_input_tokens_seen": 358720296, "step": 352300 }, { "epoch": 6.938237089248095, "grad_norm": 1.9479706287384033, "learning_rate": 5.587030498186277e-07, "loss": 2.0339, "num_input_tokens_seen": 358822200, "step": 352400 }, { "epoch": 6.940205941997598, "grad_norm": 1.7668222188949585, "learning_rate": 5.584982902960418e-07, "loss": 1.9545, "num_input_tokens_seen": 358924104, "step": 352500 }, { "epoch": 6.942174794747101, "grad_norm": 1.8232539892196655, "learning_rate": 5.58293520826315e-07, "loss": 2.0338, "num_input_tokens_seen": 359026024, "step": 352600 }, { "epoch": 6.944143647496603, "grad_norm": 1.8361282348632812, "learning_rate": 5.580887414442667e-07, "loss": 1.974, "num_input_tokens_seen": 359128112, "step": 352700 }, { "epoch": 6.946112500246107, "grad_norm": 1.7626744508743286, "learning_rate": 5.578839521847178e-07, "loss": 1.9398, "num_input_tokens_seen": 359230512, "step": 352800 }, { "epoch": 6.94808135299561, "grad_norm": 2.0083327293395996, "learning_rate": 5.576791530824911e-07, "loss": 1.9806, "num_input_tokens_seen": 359332912, "step": 352900 }, { "epoch": 6.950050205745113, "grad_norm": 2.031496524810791, "learning_rate": 5.574743441724108e-07, "loss": 1.9822, "num_input_tokens_seen": 359434016, "step": 353000 }, { "epoch": 6.952019058494615, "grad_norm": 2.3042380809783936, "learning_rate": 5.572695254893031e-07, "loss": 2.0024, "num_input_tokens_seen": 359536416, "step": 353100 }, { "epoch": 6.953987911244118, "grad_norm": 1.96910560131073, "learning_rate": 5.570646970679957e-07, "loss": 1.9881, "num_input_tokens_seen": 359638816, "step": 353200 }, { "epoch": 6.955956763993621, "grad_norm": 2.1909523010253906, "learning_rate": 5.568598589433178e-07, "loss": 1.9824, "num_input_tokens_seen": 359740496, "step": 353300 }, { "epoch": 6.957925616743124, "grad_norm": 1.9911549091339111, "learning_rate": 5.566550111501003e-07, "loss": 1.9757, "num_input_tokens_seen": 359842272, "step": 353400 }, { "epoch": 6.959894469492626, "grad_norm": 2.0698816776275635, "learning_rate": 5.564501537231763e-07, "loss": 1.9465, "num_input_tokens_seen": 359944672, "step": 353500 }, { "epoch": 6.961863322242129, "grad_norm": 2.081211566925049, "learning_rate": 5.562452866973797e-07, "loss": 1.9346, "num_input_tokens_seen": 360045192, "step": 353600 }, { "epoch": 6.963832174991633, "grad_norm": 1.645334243774414, "learning_rate": 5.560404101075463e-07, "loss": 2.0468, "num_input_tokens_seen": 360147592, "step": 353700 }, { "epoch": 6.9658010277411355, "grad_norm": 2.0219058990478516, "learning_rate": 5.558355239885141e-07, "loss": 1.9634, "num_input_tokens_seen": 360249384, "step": 353800 }, { "epoch": 6.967769880490638, "grad_norm": 1.7644826173782349, "learning_rate": 5.556306283751217e-07, "loss": 1.9686, "num_input_tokens_seen": 360350376, "step": 353900 }, { "epoch": 6.969738733240141, "grad_norm": 2.12752103805542, "learning_rate": 5.554257233022104e-07, "loss": 1.9833, "num_input_tokens_seen": 360452776, "step": 354000 }, { "epoch": 6.971707585989644, "grad_norm": 2.0328681468963623, "learning_rate": 5.552208088046222e-07, "loss": 2.0229, "num_input_tokens_seen": 360553616, "step": 354100 }, { "epoch": 6.9736764387391466, "grad_norm": 2.161731004714966, "learning_rate": 5.550158849172013e-07, "loss": 2.0487, "num_input_tokens_seen": 360655200, "step": 354200 }, { "epoch": 6.975645291488649, "grad_norm": 2.0961341857910156, "learning_rate": 5.548109516747932e-07, "loss": 1.9601, "num_input_tokens_seen": 360756976, "step": 354300 }, { "epoch": 6.977614144238152, "grad_norm": 1.967660903930664, "learning_rate": 5.546060091122448e-07, "loss": 1.9854, "num_input_tokens_seen": 360858088, "step": 354400 }, { "epoch": 6.979582996987656, "grad_norm": 1.7927228212356567, "learning_rate": 5.544010572644053e-07, "loss": 1.9871, "num_input_tokens_seen": 360959016, "step": 354500 }, { "epoch": 6.9815518497371585, "grad_norm": 2.0117151737213135, "learning_rate": 5.54196096166125e-07, "loss": 2.0087, "num_input_tokens_seen": 361061416, "step": 354600 }, { "epoch": 6.983520702486661, "grad_norm": 1.8553569316864014, "learning_rate": 5.539911258522555e-07, "loss": 1.952, "num_input_tokens_seen": 361162432, "step": 354700 }, { "epoch": 6.985489555236164, "grad_norm": 1.8479331731796265, "learning_rate": 5.537861463576505e-07, "loss": 1.9504, "num_input_tokens_seen": 361264208, "step": 354800 }, { "epoch": 6.987458407985667, "grad_norm": 1.9098297357559204, "learning_rate": 5.535811577171649e-07, "loss": 1.9919, "num_input_tokens_seen": 361366416, "step": 354900 }, { "epoch": 6.9894272607351695, "grad_norm": 1.964388132095337, "learning_rate": 5.533761599656556e-07, "loss": 1.9851, "num_input_tokens_seen": 361466832, "step": 355000 }, { "epoch": 6.991396113484672, "grad_norm": 2.710512399673462, "learning_rate": 5.531711531379801e-07, "loss": 1.9815, "num_input_tokens_seen": 361568632, "step": 355100 }, { "epoch": 6.993364966234175, "grad_norm": 1.9005932807922363, "learning_rate": 5.529661372689987e-07, "loss": 1.9938, "num_input_tokens_seen": 361670496, "step": 355200 }, { "epoch": 6.995333818983678, "grad_norm": 2.0018246173858643, "learning_rate": 5.527611123935724e-07, "loss": 2.0307, "num_input_tokens_seen": 361771968, "step": 355300 }, { "epoch": 6.997302671733181, "grad_norm": 2.3461527824401855, "learning_rate": 5.525560785465638e-07, "loss": 1.9936, "num_input_tokens_seen": 361872496, "step": 355400 }, { "epoch": 6.999271524482684, "grad_norm": 11.463159561157227, "learning_rate": 5.523510357628375e-07, "loss": 1.989, "num_input_tokens_seen": 361974896, "step": 355500 }, { "epoch": 7.001240377232187, "grad_norm": 2.019601821899414, "learning_rate": 5.521459840772591e-07, "loss": 2.018, "num_input_tokens_seen": 362077296, "step": 355600 }, { "epoch": 7.00320922998169, "grad_norm": 1.8182095289230347, "learning_rate": 5.519409235246958e-07, "loss": 2.0031, "num_input_tokens_seen": 362179696, "step": 355700 }, { "epoch": 7.0051780827311925, "grad_norm": 1.9885685443878174, "learning_rate": 5.517358541400166e-07, "loss": 1.9816, "num_input_tokens_seen": 362282096, "step": 355800 }, { "epoch": 7.007146935480695, "grad_norm": 1.6222323179244995, "learning_rate": 5.515307759580917e-07, "loss": 1.9919, "num_input_tokens_seen": 362383640, "step": 355900 }, { "epoch": 7.009115788230198, "grad_norm": 2.084733247756958, "learning_rate": 5.513256890137932e-07, "loss": 2.0344, "num_input_tokens_seen": 362485072, "step": 356000 }, { "epoch": 7.011084640979701, "grad_norm": 1.9736841917037964, "learning_rate": 5.511205933419941e-07, "loss": 1.9684, "num_input_tokens_seen": 362587472, "step": 356100 }, { "epoch": 7.013053493729204, "grad_norm": 1.8116730451583862, "learning_rate": 5.509154889775691e-07, "loss": 1.9543, "num_input_tokens_seen": 362689872, "step": 356200 }, { "epoch": 7.015022346478707, "grad_norm": 1.9676889181137085, "learning_rate": 5.507103759553947e-07, "loss": 2.0076, "num_input_tokens_seen": 362792272, "step": 356300 }, { "epoch": 7.01699119922821, "grad_norm": 1.8944008350372314, "learning_rate": 5.505052543103487e-07, "loss": 2.0034, "num_input_tokens_seen": 362893760, "step": 356400 }, { "epoch": 7.018960051977713, "grad_norm": 1.8716422319412231, "learning_rate": 5.503001240773099e-07, "loss": 1.9681, "num_input_tokens_seen": 362995512, "step": 356500 }, { "epoch": 7.020928904727215, "grad_norm": 2.067317008972168, "learning_rate": 5.500949852911596e-07, "loss": 2.0361, "num_input_tokens_seen": 363097352, "step": 356600 }, { "epoch": 7.022897757476718, "grad_norm": 1.9743908643722534, "learning_rate": 5.498898379867794e-07, "loss": 2.0205, "num_input_tokens_seen": 363199752, "step": 356700 }, { "epoch": 7.024866610226221, "grad_norm": 1.8057843446731567, "learning_rate": 5.496846821990532e-07, "loss": 1.98, "num_input_tokens_seen": 363302152, "step": 356800 }, { "epoch": 7.026835462975724, "grad_norm": 2.0450387001037598, "learning_rate": 5.494795179628658e-07, "loss": 1.9822, "num_input_tokens_seen": 363403064, "step": 356900 }, { "epoch": 7.028804315725227, "grad_norm": 1.7592406272888184, "learning_rate": 5.492743453131036e-07, "loss": 2.0479, "num_input_tokens_seen": 363504872, "step": 357000 }, { "epoch": 7.03077316847473, "grad_norm": 1.8680310249328613, "learning_rate": 5.490691642846545e-07, "loss": 1.9885, "num_input_tokens_seen": 363606808, "step": 357100 }, { "epoch": 7.032742021224233, "grad_norm": 1.6684812307357788, "learning_rate": 5.488639749124081e-07, "loss": 1.9712, "num_input_tokens_seen": 363707656, "step": 357200 }, { "epoch": 7.034710873973736, "grad_norm": 1.6753751039505005, "learning_rate": 5.486587772312547e-07, "loss": 1.9733, "num_input_tokens_seen": 363809576, "step": 357300 }, { "epoch": 7.036679726723238, "grad_norm": 2.127769708633423, "learning_rate": 5.484535712760867e-07, "loss": 2.0122, "num_input_tokens_seen": 363911056, "step": 357400 }, { "epoch": 7.038648579472741, "grad_norm": 1.8474689722061157, "learning_rate": 5.482483570817977e-07, "loss": 2.0123, "num_input_tokens_seen": 364012088, "step": 357500 }, { "epoch": 7.040617432222244, "grad_norm": 1.979695439338684, "learning_rate": 5.480431346832825e-07, "loss": 1.9571, "num_input_tokens_seen": 364114008, "step": 357600 }, { "epoch": 7.042586284971747, "grad_norm": 1.8507744073867798, "learning_rate": 5.478379041154374e-07, "loss": 2.0002, "num_input_tokens_seen": 364215568, "step": 357700 }, { "epoch": 7.044555137721249, "grad_norm": 1.8359742164611816, "learning_rate": 5.476326654131601e-07, "loss": 1.998, "num_input_tokens_seen": 364317968, "step": 357800 }, { "epoch": 7.046523990470753, "grad_norm": 2.740100860595703, "learning_rate": 5.4742741861135e-07, "loss": 1.9667, "num_input_tokens_seen": 364419264, "step": 357900 }, { "epoch": 7.048492843220256, "grad_norm": 1.944488763809204, "learning_rate": 5.472221637449073e-07, "loss": 1.9726, "num_input_tokens_seen": 364521664, "step": 358000 }, { "epoch": 7.050461695969759, "grad_norm": 2.1473124027252197, "learning_rate": 5.470169008487339e-07, "loss": 2.0125, "num_input_tokens_seen": 364622504, "step": 358100 }, { "epoch": 7.052430548719261, "grad_norm": 1.8698921203613281, "learning_rate": 5.468116299577331e-07, "loss": 1.9554, "num_input_tokens_seen": 364724224, "step": 358200 }, { "epoch": 7.054399401468764, "grad_norm": 1.8280047178268433, "learning_rate": 5.466063511068095e-07, "loss": 1.9793, "num_input_tokens_seen": 364825912, "step": 358300 }, { "epoch": 7.056368254218267, "grad_norm": 1.7389600276947021, "learning_rate": 5.464010643308689e-07, "loss": 1.9385, "num_input_tokens_seen": 364927736, "step": 358400 }, { "epoch": 7.05833710696777, "grad_norm": 2.045398235321045, "learning_rate": 5.461957696648186e-07, "loss": 1.9591, "num_input_tokens_seen": 365030136, "step": 358500 }, { "epoch": 7.060305959717272, "grad_norm": 1.9011130332946777, "learning_rate": 5.459904671435673e-07, "loss": 1.982, "num_input_tokens_seen": 365132088, "step": 358600 }, { "epoch": 7.062274812466776, "grad_norm": 1.9591118097305298, "learning_rate": 5.457851568020252e-07, "loss": 2.0108, "num_input_tokens_seen": 365233056, "step": 358700 }, { "epoch": 7.064243665216279, "grad_norm": 2.0697875022888184, "learning_rate": 5.455798386751033e-07, "loss": 1.9933, "num_input_tokens_seen": 365334920, "step": 358800 }, { "epoch": 7.0662125179657815, "grad_norm": 1.8353288173675537, "learning_rate": 5.453745127977142e-07, "loss": 1.9743, "num_input_tokens_seen": 365437320, "step": 358900 }, { "epoch": 7.068181370715284, "grad_norm": 1.771433711051941, "learning_rate": 5.451691792047719e-07, "loss": 1.9767, "num_input_tokens_seen": 365539152, "step": 359000 }, { "epoch": 7.070150223464787, "grad_norm": 1.89574134349823, "learning_rate": 5.449638379311917e-07, "loss": 2.0361, "num_input_tokens_seen": 365640624, "step": 359100 }, { "epoch": 7.07211907621429, "grad_norm": 1.7786600589752197, "learning_rate": 5.447584890118901e-07, "loss": 1.9738, "num_input_tokens_seen": 365743024, "step": 359200 }, { "epoch": 7.074087928963793, "grad_norm": 2.0453200340270996, "learning_rate": 5.445531324817849e-07, "loss": 1.9797, "num_input_tokens_seen": 365845424, "step": 359300 }, { "epoch": 7.076056781713295, "grad_norm": 1.8308055400848389, "learning_rate": 5.443477683757952e-07, "loss": 1.9798, "num_input_tokens_seen": 365946416, "step": 359400 }, { "epoch": 7.078025634462799, "grad_norm": 2.2811360359191895, "learning_rate": 5.441423967288416e-07, "loss": 1.9628, "num_input_tokens_seen": 366047360, "step": 359500 }, { "epoch": 7.079994487212302, "grad_norm": 2.0829670429229736, "learning_rate": 5.439370175758457e-07, "loss": 1.9381, "num_input_tokens_seen": 366149760, "step": 359600 }, { "epoch": 7.0819633399618045, "grad_norm": 2.0165088176727295, "learning_rate": 5.437316309517308e-07, "loss": 1.9481, "num_input_tokens_seen": 366252160, "step": 359700 }, { "epoch": 7.083932192711307, "grad_norm": 5.970108509063721, "learning_rate": 5.435262368914207e-07, "loss": 1.9574, "num_input_tokens_seen": 366352440, "step": 359800 }, { "epoch": 7.08590104546081, "grad_norm": 1.8079566955566406, "learning_rate": 5.433208354298413e-07, "loss": 1.9789, "num_input_tokens_seen": 366454840, "step": 359900 }, { "epoch": 7.087869898210313, "grad_norm": 1.9190999269485474, "learning_rate": 5.43115426601919e-07, "loss": 2.0061, "num_input_tokens_seen": 366553904, "step": 360000 }, { "epoch": 7.0898387509598155, "grad_norm": 1.6828067302703857, "learning_rate": 5.429100104425823e-07, "loss": 2.0108, "num_input_tokens_seen": 366655624, "step": 360100 }, { "epoch": 7.091807603709318, "grad_norm": 1.8801361322402954, "learning_rate": 5.427045869867601e-07, "loss": 1.971, "num_input_tokens_seen": 366757520, "step": 360200 }, { "epoch": 7.093776456458821, "grad_norm": 1.888176441192627, "learning_rate": 5.424991562693831e-07, "loss": 1.9798, "num_input_tokens_seen": 366859920, "step": 360300 }, { "epoch": 7.095745309208325, "grad_norm": 1.9752569198608398, "learning_rate": 5.422937183253828e-07, "loss": 2.05, "num_input_tokens_seen": 366961816, "step": 360400 }, { "epoch": 7.0977141619578275, "grad_norm": 3.1653244495391846, "learning_rate": 5.420882731896926e-07, "loss": 2.0098, "num_input_tokens_seen": 367063400, "step": 360500 }, { "epoch": 7.09968301470733, "grad_norm": 1.7968275547027588, "learning_rate": 5.418828208972465e-07, "loss": 1.9966, "num_input_tokens_seen": 367165800, "step": 360600 }, { "epoch": 7.101651867456833, "grad_norm": 2.205883264541626, "learning_rate": 5.416773614829802e-07, "loss": 1.9548, "num_input_tokens_seen": 367267584, "step": 360700 }, { "epoch": 7.103620720206336, "grad_norm": 2.1103649139404297, "learning_rate": 5.414718949818299e-07, "loss": 1.9843, "num_input_tokens_seen": 367369216, "step": 360800 }, { "epoch": 7.1055895729558385, "grad_norm": 1.872778296470642, "learning_rate": 5.412664214287337e-07, "loss": 1.9398, "num_input_tokens_seen": 367471616, "step": 360900 }, { "epoch": 7.107558425705341, "grad_norm": 1.6096558570861816, "learning_rate": 5.410609408586305e-07, "loss": 2.0362, "num_input_tokens_seen": 367571432, "step": 361000 }, { "epoch": 7.109527278454844, "grad_norm": 2.4558537006378174, "learning_rate": 5.408554533064607e-07, "loss": 1.948, "num_input_tokens_seen": 367673160, "step": 361100 }, { "epoch": 7.111496131204348, "grad_norm": 1.9940905570983887, "learning_rate": 5.406499588071657e-07, "loss": 1.9899, "num_input_tokens_seen": 367774728, "step": 361200 }, { "epoch": 7.11346498395385, "grad_norm": 2.367253541946411, "learning_rate": 5.404444573956881e-07, "loss": 1.977, "num_input_tokens_seen": 367876416, "step": 361300 }, { "epoch": 7.115433836703353, "grad_norm": 1.7973729372024536, "learning_rate": 5.402389491069714e-07, "loss": 1.9838, "num_input_tokens_seen": 367978816, "step": 361400 }, { "epoch": 7.117402689452856, "grad_norm": 1.813769817352295, "learning_rate": 5.400334339759611e-07, "loss": 2.0028, "num_input_tokens_seen": 368079560, "step": 361500 }, { "epoch": 7.119371542202359, "grad_norm": 1.7249197959899902, "learning_rate": 5.398279120376028e-07, "loss": 1.9971, "num_input_tokens_seen": 368181960, "step": 361600 }, { "epoch": 7.121340394951861, "grad_norm": 1.7796777486801147, "learning_rate": 5.39622383326844e-07, "loss": 1.9993, "num_input_tokens_seen": 368284360, "step": 361700 }, { "epoch": 7.123309247701364, "grad_norm": 1.9053800106048584, "learning_rate": 5.394168478786333e-07, "loss": 2.0051, "num_input_tokens_seen": 368386240, "step": 361800 }, { "epoch": 7.125278100450867, "grad_norm": 2.1554689407348633, "learning_rate": 5.392113057279199e-07, "loss": 2.0154, "num_input_tokens_seen": 368488640, "step": 361900 }, { "epoch": 7.12724695320037, "grad_norm": 2.1221604347229004, "learning_rate": 5.390057569096547e-07, "loss": 2.0187, "num_input_tokens_seen": 368590752, "step": 362000 }, { "epoch": 7.129215805949873, "grad_norm": 2.1467058658599854, "learning_rate": 5.388002014587895e-07, "loss": 1.9533, "num_input_tokens_seen": 368693152, "step": 362100 }, { "epoch": 7.131184658699376, "grad_norm": 1.7196701765060425, "learning_rate": 5.385946394102774e-07, "loss": 2.0007, "num_input_tokens_seen": 368795552, "step": 362200 }, { "epoch": 7.133153511448879, "grad_norm": 1.777214527130127, "learning_rate": 5.383890707990722e-07, "loss": 1.9714, "num_input_tokens_seen": 368897056, "step": 362300 }, { "epoch": 7.135122364198382, "grad_norm": 2.4955103397369385, "learning_rate": 5.381834956601296e-07, "loss": 2.0043, "num_input_tokens_seen": 368998624, "step": 362400 }, { "epoch": 7.137091216947884, "grad_norm": 2.059335708618164, "learning_rate": 5.379779140284054e-07, "loss": 2.024, "num_input_tokens_seen": 369100440, "step": 362500 }, { "epoch": 7.139060069697387, "grad_norm": 1.8319780826568604, "learning_rate": 5.377723259388572e-07, "loss": 1.9471, "num_input_tokens_seen": 369202840, "step": 362600 }, { "epoch": 7.14102892244689, "grad_norm": 3.3601717948913574, "learning_rate": 5.375667314264436e-07, "loss": 1.968, "num_input_tokens_seen": 369304720, "step": 362700 }, { "epoch": 7.142997775196393, "grad_norm": 1.861990213394165, "learning_rate": 5.373611305261243e-07, "loss": 1.9488, "num_input_tokens_seen": 369405672, "step": 362800 }, { "epoch": 7.144966627945896, "grad_norm": 1.8643226623535156, "learning_rate": 5.371555232728597e-07, "loss": 2.0228, "num_input_tokens_seen": 369507456, "step": 362900 }, { "epoch": 7.146935480695399, "grad_norm": 1.7534048557281494, "learning_rate": 5.369499097016119e-07, "loss": 1.9745, "num_input_tokens_seen": 369609304, "step": 363000 }, { "epoch": 7.148904333444902, "grad_norm": 2.1406805515289307, "learning_rate": 5.367442898473435e-07, "loss": 1.975, "num_input_tokens_seen": 369711064, "step": 363100 }, { "epoch": 7.150873186194405, "grad_norm": 1.6065541505813599, "learning_rate": 5.365386637450187e-07, "loss": 2.0399, "num_input_tokens_seen": 369812864, "step": 363200 }, { "epoch": 7.152842038943907, "grad_norm": 2.1287107467651367, "learning_rate": 5.363330314296022e-07, "loss": 1.9809, "num_input_tokens_seen": 369914504, "step": 363300 }, { "epoch": 7.15481089169341, "grad_norm": 2.327932834625244, "learning_rate": 5.361273929360601e-07, "loss": 2.0227, "num_input_tokens_seen": 370016904, "step": 363400 }, { "epoch": 7.156779744442913, "grad_norm": 1.8001776933670044, "learning_rate": 5.359217482993596e-07, "loss": 1.9704, "num_input_tokens_seen": 370119304, "step": 363500 }, { "epoch": 7.158748597192416, "grad_norm": 1.7063366174697876, "learning_rate": 5.357160975544687e-07, "loss": 1.9766, "num_input_tokens_seen": 370221704, "step": 363600 }, { "epoch": 7.160717449941918, "grad_norm": 1.834115982055664, "learning_rate": 5.355104407363566e-07, "loss": 1.9631, "num_input_tokens_seen": 370323464, "step": 363700 }, { "epoch": 7.162686302691422, "grad_norm": 2.496907949447632, "learning_rate": 5.353047778799938e-07, "loss": 1.9185, "num_input_tokens_seen": 370425232, "step": 363800 }, { "epoch": 7.164655155440925, "grad_norm": 1.6020030975341797, "learning_rate": 5.350991090203511e-07, "loss": 2.0283, "num_input_tokens_seen": 370527632, "step": 363900 }, { "epoch": 7.1666240081904276, "grad_norm": 2.2522361278533936, "learning_rate": 5.348934341924011e-07, "loss": 1.9769, "num_input_tokens_seen": 370628736, "step": 364000 }, { "epoch": 7.16859286093993, "grad_norm": 1.9184983968734741, "learning_rate": 5.346877534311169e-07, "loss": 2.0164, "num_input_tokens_seen": 370730432, "step": 364100 }, { "epoch": 7.170561713689433, "grad_norm": 1.8637442588806152, "learning_rate": 5.344820667714727e-07, "loss": 1.9886, "num_input_tokens_seen": 370832832, "step": 364200 }, { "epoch": 7.172530566438936, "grad_norm": 1.794509768486023, "learning_rate": 5.342763742484439e-07, "loss": 2.0217, "num_input_tokens_seen": 370934408, "step": 364300 }, { "epoch": 7.174499419188439, "grad_norm": 1.8493796586990356, "learning_rate": 5.340706758970069e-07, "loss": 1.9404, "num_input_tokens_seen": 371036352, "step": 364400 }, { "epoch": 7.176468271937941, "grad_norm": 2.63429856300354, "learning_rate": 5.338649717521387e-07, "loss": 1.9922, "num_input_tokens_seen": 371137976, "step": 364500 }, { "epoch": 7.178437124687445, "grad_norm": 2.149338722229004, "learning_rate": 5.336592618488176e-07, "loss": 1.9469, "num_input_tokens_seen": 371240376, "step": 364600 }, { "epoch": 7.180405977436948, "grad_norm": 1.8350015878677368, "learning_rate": 5.33453546222023e-07, "loss": 1.9582, "num_input_tokens_seen": 371342776, "step": 364700 }, { "epoch": 7.1823748301864505, "grad_norm": 2.76448655128479, "learning_rate": 5.33247824906735e-07, "loss": 2.0503, "num_input_tokens_seen": 371445176, "step": 364800 }, { "epoch": 7.184343682935953, "grad_norm": 4.350588321685791, "learning_rate": 5.330420979379349e-07, "loss": 1.9721, "num_input_tokens_seen": 371546968, "step": 364900 }, { "epoch": 7.186312535685456, "grad_norm": 1.8414084911346436, "learning_rate": 5.328363653506047e-07, "loss": 1.9958, "num_input_tokens_seen": 371649368, "step": 365000 }, { "epoch": 7.188281388434959, "grad_norm": 1.9045623540878296, "learning_rate": 5.326306271797274e-07, "loss": 2.0003, "num_input_tokens_seen": 371751400, "step": 365100 }, { "epoch": 7.1902502411844615, "grad_norm": 1.8278273344039917, "learning_rate": 5.324248834602874e-07, "loss": 2.0071, "num_input_tokens_seen": 371853800, "step": 365200 }, { "epoch": 7.192219093933964, "grad_norm": 1.9607822895050049, "learning_rate": 5.322191342272692e-07, "loss": 1.9618, "num_input_tokens_seen": 371955704, "step": 365300 }, { "epoch": 7.194187946683468, "grad_norm": 2.0386648178100586, "learning_rate": 5.320133795156591e-07, "loss": 1.9965, "num_input_tokens_seen": 372058104, "step": 365400 }, { "epoch": 7.196156799432971, "grad_norm": 1.765509009361267, "learning_rate": 5.318076193604439e-07, "loss": 1.9829, "num_input_tokens_seen": 372160504, "step": 365500 }, { "epoch": 7.1981256521824735, "grad_norm": 2.096997022628784, "learning_rate": 5.316018537966112e-07, "loss": 2.0115, "num_input_tokens_seen": 372262128, "step": 365600 }, { "epoch": 7.200094504931976, "grad_norm": 1.8946949243545532, "learning_rate": 5.313960828591498e-07, "loss": 1.9878, "num_input_tokens_seen": 372364528, "step": 365700 }, { "epoch": 7.202063357681479, "grad_norm": 1.7764822244644165, "learning_rate": 5.311903065830494e-07, "loss": 1.9765, "num_input_tokens_seen": 372466584, "step": 365800 }, { "epoch": 7.204032210430982, "grad_norm": 19.783960342407227, "learning_rate": 5.309845250033004e-07, "loss": 2.0381, "num_input_tokens_seen": 372564808, "step": 365900 }, { "epoch": 7.2060010631804845, "grad_norm": 16.93162727355957, "learning_rate": 5.307787381548943e-07, "loss": 1.9692, "num_input_tokens_seen": 372665648, "step": 366000 }, { "epoch": 7.207969915929987, "grad_norm": 2.239443778991699, "learning_rate": 5.305729460728233e-07, "loss": 2.0435, "num_input_tokens_seen": 372767024, "step": 366100 }, { "epoch": 7.209938768679491, "grad_norm": 1.760794758796692, "learning_rate": 5.303671487920807e-07, "loss": 1.9828, "num_input_tokens_seen": 372868616, "step": 366200 }, { "epoch": 7.211907621428994, "grad_norm": 1.9899064302444458, "learning_rate": 5.301613463476606e-07, "loss": 2.0166, "num_input_tokens_seen": 372969136, "step": 366300 }, { "epoch": 7.213876474178496, "grad_norm": 1.6896626949310303, "learning_rate": 5.299555387745579e-07, "loss": 1.9703, "num_input_tokens_seen": 373070784, "step": 366400 }, { "epoch": 7.215845326927999, "grad_norm": 2.2399237155914307, "learning_rate": 5.297497261077686e-07, "loss": 1.9703, "num_input_tokens_seen": 373173184, "step": 366500 }, { "epoch": 7.217814179677502, "grad_norm": 2.2764811515808105, "learning_rate": 5.295439083822891e-07, "loss": 2.0189, "num_input_tokens_seen": 373274960, "step": 366600 }, { "epoch": 7.219783032427005, "grad_norm": 3.2627921104431152, "learning_rate": 5.293380856331173e-07, "loss": 1.9366, "num_input_tokens_seen": 373377360, "step": 366700 }, { "epoch": 7.2217518851765075, "grad_norm": 2.1788229942321777, "learning_rate": 5.291322578952514e-07, "loss": 2.0299, "num_input_tokens_seen": 373479760, "step": 366800 }, { "epoch": 7.22372073792601, "grad_norm": 2.0692787170410156, "learning_rate": 5.28926425203691e-07, "loss": 2.0199, "num_input_tokens_seen": 373582160, "step": 366900 }, { "epoch": 7.225689590675513, "grad_norm": 2.1803781986236572, "learning_rate": 5.287205875934357e-07, "loss": 2.0034, "num_input_tokens_seen": 373683944, "step": 367000 }, { "epoch": 7.227658443425017, "grad_norm": 1.8231624364852905, "learning_rate": 5.285147450994868e-07, "loss": 1.9944, "num_input_tokens_seen": 373786344, "step": 367100 }, { "epoch": 7.229627296174519, "grad_norm": 2.024059534072876, "learning_rate": 5.283088977568459e-07, "loss": 1.9888, "num_input_tokens_seen": 373887952, "step": 367200 }, { "epoch": 7.231596148924022, "grad_norm": 1.960951566696167, "learning_rate": 5.281030456005159e-07, "loss": 1.9572, "num_input_tokens_seen": 373990352, "step": 367300 }, { "epoch": 7.233565001673525, "grad_norm": 1.7448127269744873, "learning_rate": 5.278971886654999e-07, "loss": 1.9775, "num_input_tokens_seen": 374091760, "step": 367400 }, { "epoch": 7.235533854423028, "grad_norm": 1.9026989936828613, "learning_rate": 5.276913269868024e-07, "loss": 1.982, "num_input_tokens_seen": 374193528, "step": 367500 }, { "epoch": 7.23750270717253, "grad_norm": 2.157411575317383, "learning_rate": 5.274854605994281e-07, "loss": 1.9915, "num_input_tokens_seen": 374294808, "step": 367600 }, { "epoch": 7.239471559922033, "grad_norm": 2.068803310394287, "learning_rate": 5.272795895383831e-07, "loss": 1.9947, "num_input_tokens_seen": 374396448, "step": 367700 }, { "epoch": 7.241440412671536, "grad_norm": 2.757269859313965, "learning_rate": 5.27073713838674e-07, "loss": 1.9894, "num_input_tokens_seen": 374498288, "step": 367800 }, { "epoch": 7.24340926542104, "grad_norm": 1.7236621379852295, "learning_rate": 5.268678335353084e-07, "loss": 1.9905, "num_input_tokens_seen": 374599984, "step": 367900 }, { "epoch": 7.245378118170542, "grad_norm": 2.0685951709747314, "learning_rate": 5.266619486632941e-07, "loss": 1.9732, "num_input_tokens_seen": 374702384, "step": 368000 }, { "epoch": 7.247346970920045, "grad_norm": 1.782249093055725, "learning_rate": 5.264560592576404e-07, "loss": 1.9993, "num_input_tokens_seen": 374804784, "step": 368100 }, { "epoch": 7.249315823669548, "grad_norm": 2.0508511066436768, "learning_rate": 5.26250165353357e-07, "loss": 2.0149, "num_input_tokens_seen": 374906256, "step": 368200 }, { "epoch": 7.251284676419051, "grad_norm": 1.8285428285598755, "learning_rate": 5.260442669854544e-07, "loss": 1.958, "num_input_tokens_seen": 375008656, "step": 368300 }, { "epoch": 7.253253529168553, "grad_norm": 1.7665964365005493, "learning_rate": 5.258383641889438e-07, "loss": 1.9831, "num_input_tokens_seen": 375109680, "step": 368400 }, { "epoch": 7.255222381918056, "grad_norm": 1.7450065612792969, "learning_rate": 5.256324569988373e-07, "loss": 1.988, "num_input_tokens_seen": 375212080, "step": 368500 }, { "epoch": 7.257191234667559, "grad_norm": 1.9241122007369995, "learning_rate": 5.254265454501476e-07, "loss": 1.9472, "num_input_tokens_seen": 375314000, "step": 368600 }, { "epoch": 7.259160087417062, "grad_norm": 1.618250846862793, "learning_rate": 5.252206295778884e-07, "loss": 1.983, "num_input_tokens_seen": 375415704, "step": 368700 }, { "epoch": 7.261128940166565, "grad_norm": 2.255683422088623, "learning_rate": 5.250147094170738e-07, "loss": 2.0033, "num_input_tokens_seen": 375518104, "step": 368800 }, { "epoch": 7.263097792916068, "grad_norm": 1.918139100074768, "learning_rate": 5.24808785002719e-07, "loss": 1.9646, "num_input_tokens_seen": 375620504, "step": 368900 }, { "epoch": 7.265066645665571, "grad_norm": 2.014235496520996, "learning_rate": 5.246028563698394e-07, "loss": 1.9885, "num_input_tokens_seen": 375722712, "step": 369000 }, { "epoch": 7.267035498415074, "grad_norm": 2.120821714401245, "learning_rate": 5.243969235534517e-07, "loss": 1.9718, "num_input_tokens_seen": 375824536, "step": 369100 }, { "epoch": 7.269004351164576, "grad_norm": 1.9699757099151611, "learning_rate": 5.241909865885728e-07, "loss": 1.9586, "num_input_tokens_seen": 375926936, "step": 369200 }, { "epoch": 7.270973203914079, "grad_norm": 1.9826579093933105, "learning_rate": 5.239850455102208e-07, "loss": 1.9874, "num_input_tokens_seen": 376028128, "step": 369300 }, { "epoch": 7.272942056663582, "grad_norm": 1.8360637426376343, "learning_rate": 5.23779100353414e-07, "loss": 1.9952, "num_input_tokens_seen": 376129192, "step": 369400 }, { "epoch": 7.274910909413085, "grad_norm": 1.8102113008499146, "learning_rate": 5.235731511531718e-07, "loss": 1.9623, "num_input_tokens_seen": 376231592, "step": 369500 }, { "epoch": 7.276879762162588, "grad_norm": 1.8897552490234375, "learning_rate": 5.23367197944514e-07, "loss": 2.0113, "num_input_tokens_seen": 376333552, "step": 369600 }, { "epoch": 7.278848614912091, "grad_norm": 2.0573432445526123, "learning_rate": 5.231612407624614e-07, "loss": 1.9529, "num_input_tokens_seen": 376435672, "step": 369700 }, { "epoch": 7.280817467661594, "grad_norm": 1.9561767578125, "learning_rate": 5.229552796420351e-07, "loss": 2.0072, "num_input_tokens_seen": 376536784, "step": 369800 }, { "epoch": 7.2827863204110965, "grad_norm": 1.959929347038269, "learning_rate": 5.227493146182571e-07, "loss": 2.0258, "num_input_tokens_seen": 376637344, "step": 369900 }, { "epoch": 7.284755173160599, "grad_norm": 2.0031423568725586, "learning_rate": 5.225433457261501e-07, "loss": 1.9895, "num_input_tokens_seen": 376738952, "step": 370000 }, { "epoch": 7.286724025910102, "grad_norm": 2.1231939792633057, "learning_rate": 5.223373730007371e-07, "loss": 1.9619, "num_input_tokens_seen": 376840224, "step": 370100 }, { "epoch": 7.288692878659605, "grad_norm": 2.225644826889038, "learning_rate": 5.221313964770424e-07, "loss": 1.9698, "num_input_tokens_seen": 376941584, "step": 370200 }, { "epoch": 7.290661731409108, "grad_norm": 2.6365232467651367, "learning_rate": 5.219254161900903e-07, "loss": 1.9748, "num_input_tokens_seen": 377043984, "step": 370300 }, { "epoch": 7.29263058415861, "grad_norm": 1.9064011573791504, "learning_rate": 5.217194321749064e-07, "loss": 1.9599, "num_input_tokens_seen": 377146384, "step": 370400 }, { "epoch": 7.294599436908114, "grad_norm": 2.127610921859741, "learning_rate": 5.215134444665157e-07, "loss": 2.0045, "num_input_tokens_seen": 377248784, "step": 370500 }, { "epoch": 7.296568289657617, "grad_norm": 1.8829401731491089, "learning_rate": 5.213074530999457e-07, "loss": 1.9863, "num_input_tokens_seen": 377349416, "step": 370600 }, { "epoch": 7.2985371424071195, "grad_norm": 2.1090333461761475, "learning_rate": 5.211014581102227e-07, "loss": 1.9816, "num_input_tokens_seen": 377450296, "step": 370700 }, { "epoch": 7.300505995156622, "grad_norm": 1.9458856582641602, "learning_rate": 5.208954595323749e-07, "loss": 2.0244, "num_input_tokens_seen": 377550640, "step": 370800 }, { "epoch": 7.302474847906125, "grad_norm": 2.082719326019287, "learning_rate": 5.206894574014303e-07, "loss": 1.9991, "num_input_tokens_seen": 377652896, "step": 370900 }, { "epoch": 7.304443700655628, "grad_norm": 16.064754486083984, "learning_rate": 5.204834517524182e-07, "loss": 1.9665, "num_input_tokens_seen": 377753784, "step": 371000 }, { "epoch": 7.3064125534051305, "grad_norm": 1.8369344472885132, "learning_rate": 5.202774426203678e-07, "loss": 1.9223, "num_input_tokens_seen": 377856184, "step": 371100 }, { "epoch": 7.308381406154634, "grad_norm": 1.8641718626022339, "learning_rate": 5.200714300403093e-07, "loss": 1.9927, "num_input_tokens_seen": 377957056, "step": 371200 }, { "epoch": 7.310350258904137, "grad_norm": 1.9015660285949707, "learning_rate": 5.198654140472733e-07, "loss": 1.9718, "num_input_tokens_seen": 378059456, "step": 371300 }, { "epoch": 7.31231911165364, "grad_norm": 1.9375883340835571, "learning_rate": 5.196593946762914e-07, "loss": 1.9936, "num_input_tokens_seen": 378161320, "step": 371400 }, { "epoch": 7.314287964403142, "grad_norm": 1.999479055404663, "learning_rate": 5.194533719623951e-07, "loss": 2.0122, "num_input_tokens_seen": 378262448, "step": 371500 }, { "epoch": 7.316256817152645, "grad_norm": 1.7307672500610352, "learning_rate": 5.192473459406171e-07, "loss": 1.9676, "num_input_tokens_seen": 378364848, "step": 371600 }, { "epoch": 7.318225669902148, "grad_norm": 2.3091683387756348, "learning_rate": 5.190413166459901e-07, "loss": 1.9577, "num_input_tokens_seen": 378466480, "step": 371700 }, { "epoch": 7.320194522651651, "grad_norm": 2.060296058654785, "learning_rate": 5.188352841135478e-07, "loss": 1.9613, "num_input_tokens_seen": 378568256, "step": 371800 }, { "epoch": 7.3221633754011535, "grad_norm": 1.7498630285263062, "learning_rate": 5.186292483783244e-07, "loss": 1.9721, "num_input_tokens_seen": 378669696, "step": 371900 }, { "epoch": 7.324132228150656, "grad_norm": 2.043344736099243, "learning_rate": 5.184232094753545e-07, "loss": 2.0076, "num_input_tokens_seen": 378772096, "step": 372000 }, { "epoch": 7.32610108090016, "grad_norm": 1.6896330118179321, "learning_rate": 5.182171674396732e-07, "loss": 1.9804, "num_input_tokens_seen": 378873680, "step": 372100 }, { "epoch": 7.328069933649663, "grad_norm": 1.9267531633377075, "learning_rate": 5.180111223063162e-07, "loss": 1.9435, "num_input_tokens_seen": 378976080, "step": 372200 }, { "epoch": 7.330038786399165, "grad_norm": 1.8462328910827637, "learning_rate": 5.178050741103196e-07, "loss": 1.9769, "num_input_tokens_seen": 379077664, "step": 372300 }, { "epoch": 7.332007639148668, "grad_norm": 1.8400561809539795, "learning_rate": 5.175990228867205e-07, "loss": 1.9737, "num_input_tokens_seen": 379179144, "step": 372400 }, { "epoch": 7.333976491898171, "grad_norm": 3.141594886779785, "learning_rate": 5.173929686705559e-07, "loss": 1.9448, "num_input_tokens_seen": 379281544, "step": 372500 }, { "epoch": 7.335945344647674, "grad_norm": 2.2036776542663574, "learning_rate": 5.171869114968639e-07, "loss": 2.0001, "num_input_tokens_seen": 379382264, "step": 372600 }, { "epoch": 7.337914197397176, "grad_norm": 2.153395414352417, "learning_rate": 5.169808514006822e-07, "loss": 1.983, "num_input_tokens_seen": 379484168, "step": 372700 }, { "epoch": 7.339883050146679, "grad_norm": 2.3387227058410645, "learning_rate": 5.167747884170502e-07, "loss": 1.9849, "num_input_tokens_seen": 379586568, "step": 372800 }, { "epoch": 7.341851902896183, "grad_norm": 2.0397276878356934, "learning_rate": 5.165687225810068e-07, "loss": 2.0222, "num_input_tokens_seen": 379688032, "step": 372900 }, { "epoch": 7.343820755645686, "grad_norm": 1.8556203842163086, "learning_rate": 5.163626539275919e-07, "loss": 2.0037, "num_input_tokens_seen": 379789992, "step": 373000 }, { "epoch": 7.345789608395188, "grad_norm": 1.7890570163726807, "learning_rate": 5.161565824918457e-07, "loss": 2.0557, "num_input_tokens_seen": 379891664, "step": 373100 }, { "epoch": 7.347758461144691, "grad_norm": 1.9766457080841064, "learning_rate": 5.15950508308809e-07, "loss": 2.013, "num_input_tokens_seen": 379992776, "step": 373200 }, { "epoch": 7.349727313894194, "grad_norm": 1.9591894149780273, "learning_rate": 5.157444314135229e-07, "loss": 1.9999, "num_input_tokens_seen": 380093768, "step": 373300 }, { "epoch": 7.351696166643697, "grad_norm": 2.112759828567505, "learning_rate": 5.15538351841029e-07, "loss": 1.9896, "num_input_tokens_seen": 380195320, "step": 373400 }, { "epoch": 7.353665019393199, "grad_norm": 2.1614561080932617, "learning_rate": 5.153322696263693e-07, "loss": 2.0226, "num_input_tokens_seen": 380296872, "step": 373500 }, { "epoch": 7.355633872142702, "grad_norm": 1.9520407915115356, "learning_rate": 5.151261848045867e-07, "loss": 1.9622, "num_input_tokens_seen": 380399272, "step": 373600 }, { "epoch": 7.357602724892205, "grad_norm": 2.2021682262420654, "learning_rate": 5.149200974107237e-07, "loss": 1.9745, "num_input_tokens_seen": 380499280, "step": 373700 }, { "epoch": 7.3595715776417086, "grad_norm": 2.7257330417633057, "learning_rate": 5.147140074798242e-07, "loss": 1.9719, "num_input_tokens_seen": 380601680, "step": 373800 }, { "epoch": 7.361540430391211, "grad_norm": 14.789177894592285, "learning_rate": 5.145079150469317e-07, "loss": 1.9557, "num_input_tokens_seen": 380704080, "step": 373900 }, { "epoch": 7.363509283140714, "grad_norm": 1.8463948965072632, "learning_rate": 5.143018201470907e-07, "loss": 2.0062, "num_input_tokens_seen": 380806480, "step": 374000 }, { "epoch": 7.365478135890217, "grad_norm": 1.9206358194351196, "learning_rate": 5.140957228153458e-07, "loss": 1.9592, "num_input_tokens_seen": 380908880, "step": 374100 }, { "epoch": 7.36744698863972, "grad_norm": 1.9654873609542847, "learning_rate": 5.13889623086742e-07, "loss": 2.0127, "num_input_tokens_seen": 381010488, "step": 374200 }, { "epoch": 7.369415841389222, "grad_norm": 1.9551275968551636, "learning_rate": 5.13683520996325e-07, "loss": 1.9735, "num_input_tokens_seen": 381112880, "step": 374300 }, { "epoch": 7.371384694138725, "grad_norm": 2.2033655643463135, "learning_rate": 5.134774165791406e-07, "loss": 1.9619, "num_input_tokens_seen": 381214616, "step": 374400 }, { "epoch": 7.373353546888228, "grad_norm": 6.106286525726318, "learning_rate": 5.132713098702354e-07, "loss": 1.9567, "num_input_tokens_seen": 381316440, "step": 374500 }, { "epoch": 7.3753223996377315, "grad_norm": 4.4674072265625, "learning_rate": 5.130652009046557e-07, "loss": 1.9488, "num_input_tokens_seen": 381418840, "step": 374600 }, { "epoch": 7.377291252387234, "grad_norm": 1.7868443727493286, "learning_rate": 5.12859089717449e-07, "loss": 1.9884, "num_input_tokens_seen": 381520672, "step": 374700 }, { "epoch": 7.379260105136737, "grad_norm": 1.8189754486083984, "learning_rate": 5.126529763436622e-07, "loss": 1.9947, "num_input_tokens_seen": 381623072, "step": 374800 }, { "epoch": 7.38122895788624, "grad_norm": 2.129924774169922, "learning_rate": 5.124468608183437e-07, "loss": 1.9858, "num_input_tokens_seen": 381725472, "step": 374900 }, { "epoch": 7.3831978106357425, "grad_norm": 2.500575304031372, "learning_rate": 5.122407431765415e-07, "loss": 1.9967, "num_input_tokens_seen": 381826600, "step": 375000 }, { "epoch": 7.385166663385245, "grad_norm": 1.8880819082260132, "learning_rate": 5.120346234533042e-07, "loss": 1.9229, "num_input_tokens_seen": 381929000, "step": 375100 }, { "epoch": 7.387135516134748, "grad_norm": 1.848900318145752, "learning_rate": 5.118285016836806e-07, "loss": 1.983, "num_input_tokens_seen": 382031400, "step": 375200 }, { "epoch": 7.389104368884251, "grad_norm": 1.8943896293640137, "learning_rate": 5.116223779027202e-07, "loss": 1.9336, "num_input_tokens_seen": 382133800, "step": 375300 }, { "epoch": 7.391073221633754, "grad_norm": 1.9485511779785156, "learning_rate": 5.114162521454724e-07, "loss": 1.9777, "num_input_tokens_seen": 382236200, "step": 375400 }, { "epoch": 7.393042074383257, "grad_norm": 1.9208974838256836, "learning_rate": 5.112101244469872e-07, "loss": 1.9951, "num_input_tokens_seen": 382338600, "step": 375500 }, { "epoch": 7.39501092713276, "grad_norm": 1.8350746631622314, "learning_rate": 5.110039948423148e-07, "loss": 1.9833, "num_input_tokens_seen": 382439760, "step": 375600 }, { "epoch": 7.396979779882263, "grad_norm": 1.8583351373672485, "learning_rate": 5.107978633665061e-07, "loss": 2.0236, "num_input_tokens_seen": 382541456, "step": 375700 }, { "epoch": 7.3989486326317655, "grad_norm": 2.133082389831543, "learning_rate": 5.105917300546114e-07, "loss": 1.9868, "num_input_tokens_seen": 382643720, "step": 375800 }, { "epoch": 7.400917485381268, "grad_norm": 1.9567073583602905, "learning_rate": 5.103855949416828e-07, "loss": 2.014, "num_input_tokens_seen": 382745352, "step": 375900 }, { "epoch": 7.402886338130771, "grad_norm": 2.0236613750457764, "learning_rate": 5.101794580627711e-07, "loss": 1.9616, "num_input_tokens_seen": 382847752, "step": 376000 }, { "epoch": 7.404855190880274, "grad_norm": 1.8273890018463135, "learning_rate": 5.099733194529285e-07, "loss": 1.988, "num_input_tokens_seen": 382949488, "step": 376100 }, { "epoch": 7.4068240436297765, "grad_norm": 1.6872155666351318, "learning_rate": 5.09767179147207e-07, "loss": 1.9586, "num_input_tokens_seen": 383051888, "step": 376200 }, { "epoch": 7.40879289637928, "grad_norm": 1.778103232383728, "learning_rate": 5.095610371806593e-07, "loss": 2.0053, "num_input_tokens_seen": 383152944, "step": 376300 }, { "epoch": 7.410761749128783, "grad_norm": 2.087170124053955, "learning_rate": 5.093548935883374e-07, "loss": 2.0005, "num_input_tokens_seen": 383255344, "step": 376400 }, { "epoch": 7.412730601878286, "grad_norm": 1.8912262916564941, "learning_rate": 5.091487484052952e-07, "loss": 2.0023, "num_input_tokens_seen": 383357744, "step": 376500 }, { "epoch": 7.4146994546277885, "grad_norm": 1.964666485786438, "learning_rate": 5.089426016665854e-07, "loss": 1.9713, "num_input_tokens_seen": 383458144, "step": 376600 }, { "epoch": 7.416668307377291, "grad_norm": 1.9928014278411865, "learning_rate": 5.087364534072615e-07, "loss": 1.9759, "num_input_tokens_seen": 383560544, "step": 376700 }, { "epoch": 7.418637160126794, "grad_norm": 1.9334039688110352, "learning_rate": 5.085303036623773e-07, "loss": 2.0017, "num_input_tokens_seen": 383662720, "step": 376800 }, { "epoch": 7.420606012876297, "grad_norm": 2.05747389793396, "learning_rate": 5.08324152466987e-07, "loss": 2.0033, "num_input_tokens_seen": 383764568, "step": 376900 }, { "epoch": 7.4225748656257995, "grad_norm": 1.861793041229248, "learning_rate": 5.081179998561448e-07, "loss": 1.9831, "num_input_tokens_seen": 383866968, "step": 377000 }, { "epoch": 7.424543718375302, "grad_norm": 1.9125750064849854, "learning_rate": 5.079118458649053e-07, "loss": 2.0068, "num_input_tokens_seen": 383968736, "step": 377100 }, { "epoch": 7.426512571124806, "grad_norm": 2.0038869380950928, "learning_rate": 5.077056905283231e-07, "loss": 1.991, "num_input_tokens_seen": 384071136, "step": 377200 }, { "epoch": 7.428481423874309, "grad_norm": 1.9794186353683472, "learning_rate": 5.074995338814531e-07, "loss": 1.9528, "num_input_tokens_seen": 384173536, "step": 377300 }, { "epoch": 7.430450276623811, "grad_norm": 1.9688060283660889, "learning_rate": 5.072933759593506e-07, "loss": 1.9709, "num_input_tokens_seen": 384274624, "step": 377400 }, { "epoch": 7.432419129373314, "grad_norm": 2.9141573905944824, "learning_rate": 5.070872167970712e-07, "loss": 1.9723, "num_input_tokens_seen": 384377024, "step": 377500 }, { "epoch": 7.434387982122817, "grad_norm": 1.8954172134399414, "learning_rate": 5.068810564296702e-07, "loss": 1.9585, "num_input_tokens_seen": 384479424, "step": 377600 }, { "epoch": 7.43635683487232, "grad_norm": 1.6338471174240112, "learning_rate": 5.066748948922039e-07, "loss": 1.9816, "num_input_tokens_seen": 384581824, "step": 377700 }, { "epoch": 7.4383256876218224, "grad_norm": 2.2469611167907715, "learning_rate": 5.064687322197277e-07, "loss": 1.9873, "num_input_tokens_seen": 384683360, "step": 377800 }, { "epoch": 7.440294540371326, "grad_norm": 1.7577176094055176, "learning_rate": 5.062625684472981e-07, "loss": 2.0197, "num_input_tokens_seen": 384784528, "step": 377900 }, { "epoch": 7.442263393120829, "grad_norm": 1.6048203706741333, "learning_rate": 5.060564036099718e-07, "loss": 1.9432, "num_input_tokens_seen": 384886928, "step": 378000 }, { "epoch": 7.444232245870332, "grad_norm": 1.8339276313781738, "learning_rate": 5.05850237742805e-07, "loss": 1.9394, "num_input_tokens_seen": 384989328, "step": 378100 }, { "epoch": 7.446201098619834, "grad_norm": 1.7405064105987549, "learning_rate": 5.056440708808547e-07, "loss": 1.9962, "num_input_tokens_seen": 385091728, "step": 378200 }, { "epoch": 7.448169951369337, "grad_norm": 2.010094165802002, "learning_rate": 5.054379030591777e-07, "loss": 1.9904, "num_input_tokens_seen": 385193256, "step": 378300 }, { "epoch": 7.45013880411884, "grad_norm": 1.825519323348999, "learning_rate": 5.052317343128312e-07, "loss": 2.005, "num_input_tokens_seen": 385294232, "step": 378400 }, { "epoch": 7.452107656868343, "grad_norm": 1.987107515335083, "learning_rate": 5.050255646768723e-07, "loss": 1.9861, "num_input_tokens_seen": 385396632, "step": 378500 }, { "epoch": 7.454076509617845, "grad_norm": 1.8715169429779053, "learning_rate": 5.048193941863586e-07, "loss": 1.9449, "num_input_tokens_seen": 385499032, "step": 378600 }, { "epoch": 7.456045362367348, "grad_norm": 1.9312773942947388, "learning_rate": 5.046132228763476e-07, "loss": 2.0466, "num_input_tokens_seen": 385601432, "step": 378700 }, { "epoch": 7.458014215116852, "grad_norm": 1.7639271020889282, "learning_rate": 5.044070507818968e-07, "loss": 1.9658, "num_input_tokens_seen": 385702968, "step": 378800 }, { "epoch": 7.459983067866355, "grad_norm": 1.7868900299072266, "learning_rate": 5.042008779380642e-07, "loss": 1.98, "num_input_tokens_seen": 385805368, "step": 378900 }, { "epoch": 7.461951920615857, "grad_norm": 2.0073390007019043, "learning_rate": 5.039947043799078e-07, "loss": 1.9442, "num_input_tokens_seen": 385906328, "step": 379000 }, { "epoch": 7.46392077336536, "grad_norm": 3.7053864002227783, "learning_rate": 5.037885301424857e-07, "loss": 1.9677, "num_input_tokens_seen": 386007952, "step": 379100 }, { "epoch": 7.465889626114863, "grad_norm": 2.0953264236450195, "learning_rate": 5.035823552608561e-07, "loss": 2.0439, "num_input_tokens_seen": 386109376, "step": 379200 }, { "epoch": 7.467858478864366, "grad_norm": 2.27166748046875, "learning_rate": 5.033761797700769e-07, "loss": 2.0019, "num_input_tokens_seen": 386211112, "step": 379300 }, { "epoch": 7.469827331613868, "grad_norm": 1.8703604936599731, "learning_rate": 5.031700037052071e-07, "loss": 1.9618, "num_input_tokens_seen": 386313512, "step": 379400 }, { "epoch": 7.471796184363371, "grad_norm": 1.7375736236572266, "learning_rate": 5.029638271013047e-07, "loss": 1.9814, "num_input_tokens_seen": 386415912, "step": 379500 }, { "epoch": 7.473765037112875, "grad_norm": 1.7769442796707153, "learning_rate": 5.027576499934286e-07, "loss": 1.9812, "num_input_tokens_seen": 386518312, "step": 379600 }, { "epoch": 7.4757338898623775, "grad_norm": 1.9671293497085571, "learning_rate": 5.025514724166373e-07, "loss": 2.0277, "num_input_tokens_seen": 386620112, "step": 379700 }, { "epoch": 7.47770274261188, "grad_norm": 1.70367431640625, "learning_rate": 5.023452944059896e-07, "loss": 1.9941, "num_input_tokens_seen": 386721640, "step": 379800 }, { "epoch": 7.479671595361383, "grad_norm": 1.8726248741149902, "learning_rate": 5.021391159965444e-07, "loss": 2.0435, "num_input_tokens_seen": 386824040, "step": 379900 }, { "epoch": 7.481640448110886, "grad_norm": 1.744035005569458, "learning_rate": 5.019329372233607e-07, "loss": 1.9917, "num_input_tokens_seen": 386925888, "step": 380000 }, { "epoch": 7.483609300860389, "grad_norm": 2.1568663120269775, "learning_rate": 5.017267581214971e-07, "loss": 1.9669, "num_input_tokens_seen": 387028288, "step": 380100 }, { "epoch": 7.485578153609891, "grad_norm": 1.7321265935897827, "learning_rate": 5.015205787260131e-07, "loss": 2.0032, "num_input_tokens_seen": 387129536, "step": 380200 }, { "epoch": 7.487547006359394, "grad_norm": 2.007984161376953, "learning_rate": 5.013143990719674e-07, "loss": 1.98, "num_input_tokens_seen": 387231232, "step": 380300 }, { "epoch": 7.489515859108897, "grad_norm": 2.950364589691162, "learning_rate": 5.011082191944193e-07, "loss": 1.9772, "num_input_tokens_seen": 387333184, "step": 380400 }, { "epoch": 7.4914847118584005, "grad_norm": 2.139864683151245, "learning_rate": 5.009020391284277e-07, "loss": 1.9928, "num_input_tokens_seen": 387434408, "step": 380500 }, { "epoch": 7.493453564607903, "grad_norm": 1.9698947668075562, "learning_rate": 5.006958589090521e-07, "loss": 1.9968, "num_input_tokens_seen": 387536808, "step": 380600 }, { "epoch": 7.495422417357406, "grad_norm": 1.985487461090088, "learning_rate": 5.004896785713515e-07, "loss": 1.9626, "num_input_tokens_seen": 387639208, "step": 380700 }, { "epoch": 7.497391270106909, "grad_norm": 2.3485677242279053, "learning_rate": 5.002834981503852e-07, "loss": 2.0093, "num_input_tokens_seen": 387741112, "step": 380800 }, { "epoch": 7.4993601228564115, "grad_norm": 3.384591817855835, "learning_rate": 5.000773176812125e-07, "loss": 2.0292, "num_input_tokens_seen": 387842600, "step": 380900 }, { "epoch": 7.501328975605914, "grad_norm": 1.7954802513122559, "learning_rate": 4.998711371988923e-07, "loss": 1.9622, "num_input_tokens_seen": 387943632, "step": 381000 }, { "epoch": 7.503297828355417, "grad_norm": 1.9172871112823486, "learning_rate": 4.996649567384845e-07, "loss": 2.0529, "num_input_tokens_seen": 388044400, "step": 381100 }, { "epoch": 7.50526668110492, "grad_norm": 3.3087704181671143, "learning_rate": 4.994587763350477e-07, "loss": 1.9565, "num_input_tokens_seen": 388145936, "step": 381200 }, { "epoch": 7.507235533854423, "grad_norm": 1.7594445943832397, "learning_rate": 4.992525960236417e-07, "loss": 1.9969, "num_input_tokens_seen": 388248336, "step": 381300 }, { "epoch": 7.509204386603926, "grad_norm": 1.5387449264526367, "learning_rate": 4.990464158393252e-07, "loss": 1.956, "num_input_tokens_seen": 388350736, "step": 381400 }, { "epoch": 7.511173239353429, "grad_norm": 2.319288969039917, "learning_rate": 4.988402358171577e-07, "loss": 1.9539, "num_input_tokens_seen": 388452512, "step": 381500 }, { "epoch": 7.513142092102932, "grad_norm": 2.0414860248565674, "learning_rate": 4.986340559921984e-07, "loss": 2.0061, "num_input_tokens_seen": 388554248, "step": 381600 }, { "epoch": 7.5151109448524345, "grad_norm": 5.795549392700195, "learning_rate": 4.984278763995062e-07, "loss": 1.9892, "num_input_tokens_seen": 388656648, "step": 381700 }, { "epoch": 7.517079797601937, "grad_norm": 1.7130744457244873, "learning_rate": 4.982216970741406e-07, "loss": 1.98, "num_input_tokens_seen": 388757640, "step": 381800 }, { "epoch": 7.51904865035144, "grad_norm": 2.2430741786956787, "learning_rate": 4.980155180511602e-07, "loss": 2.0105, "num_input_tokens_seen": 388860040, "step": 381900 }, { "epoch": 7.521017503100943, "grad_norm": 2.157201051712036, "learning_rate": 4.978093393656243e-07, "loss": 2.0139, "num_input_tokens_seen": 388960760, "step": 382000 }, { "epoch": 7.5229863558504455, "grad_norm": 2.068794012069702, "learning_rate": 4.976031610525918e-07, "loss": 1.9737, "num_input_tokens_seen": 389062648, "step": 382100 }, { "epoch": 7.524955208599949, "grad_norm": 2.1377761363983154, "learning_rate": 4.973969831471215e-07, "loss": 2.0052, "num_input_tokens_seen": 389164152, "step": 382200 }, { "epoch": 7.526924061349452, "grad_norm": 1.9068855047225952, "learning_rate": 4.971908056842723e-07, "loss": 1.9984, "num_input_tokens_seen": 389266552, "step": 382300 }, { "epoch": 7.528892914098955, "grad_norm": 1.7767568826675415, "learning_rate": 4.969846286991027e-07, "loss": 1.9902, "num_input_tokens_seen": 389368064, "step": 382400 }, { "epoch": 7.530861766848457, "grad_norm": 1.9303306341171265, "learning_rate": 4.967784522266718e-07, "loss": 1.966, "num_input_tokens_seen": 389469832, "step": 382500 }, { "epoch": 7.53283061959796, "grad_norm": 2.213029623031616, "learning_rate": 4.965722763020377e-07, "loss": 2.0059, "num_input_tokens_seen": 389570504, "step": 382600 }, { "epoch": 7.534799472347463, "grad_norm": 1.8339900970458984, "learning_rate": 4.963661009602594e-07, "loss": 1.9822, "num_input_tokens_seen": 389672904, "step": 382700 }, { "epoch": 7.536768325096966, "grad_norm": 1.7456740140914917, "learning_rate": 4.961599262363947e-07, "loss": 2.0026, "num_input_tokens_seen": 389773800, "step": 382800 }, { "epoch": 7.538737177846469, "grad_norm": 2.0507090091705322, "learning_rate": 4.959537521655025e-07, "loss": 2.0021, "num_input_tokens_seen": 389875648, "step": 382900 }, { "epoch": 7.540706030595972, "grad_norm": 2.145613431930542, "learning_rate": 4.957475787826402e-07, "loss": 1.9656, "num_input_tokens_seen": 389977280, "step": 383000 }, { "epoch": 7.542674883345475, "grad_norm": 1.9413028955459595, "learning_rate": 4.955414061228667e-07, "loss": 1.9959, "num_input_tokens_seen": 390078728, "step": 383100 }, { "epoch": 7.544643736094978, "grad_norm": 1.96881103515625, "learning_rate": 4.95335234221239e-07, "loss": 2.0271, "num_input_tokens_seen": 390181128, "step": 383200 }, { "epoch": 7.54661258884448, "grad_norm": 1.736331582069397, "learning_rate": 4.951290631128159e-07, "loss": 1.9793, "num_input_tokens_seen": 390283296, "step": 383300 }, { "epoch": 7.548581441593983, "grad_norm": 1.6167480945587158, "learning_rate": 4.949228928326541e-07, "loss": 1.9816, "num_input_tokens_seen": 390385424, "step": 383400 }, { "epoch": 7.550550294343486, "grad_norm": 1.916285514831543, "learning_rate": 4.947167234158116e-07, "loss": 1.9783, "num_input_tokens_seen": 390487824, "step": 383500 }, { "epoch": 7.552519147092989, "grad_norm": 2.083465099334717, "learning_rate": 4.945105548973459e-07, "loss": 1.9687, "num_input_tokens_seen": 390589808, "step": 383600 }, { "epoch": 7.554487999842491, "grad_norm": 1.9483658075332642, "learning_rate": 4.94304387312314e-07, "loss": 2.0251, "num_input_tokens_seen": 390690984, "step": 383700 }, { "epoch": 7.556456852591994, "grad_norm": 1.8473742008209229, "learning_rate": 4.94098220695773e-07, "loss": 1.9761, "num_input_tokens_seen": 390792944, "step": 383800 }, { "epoch": 7.558425705341498, "grad_norm": 2.0336999893188477, "learning_rate": 4.938920550827797e-07, "loss": 2.0295, "num_input_tokens_seen": 390892968, "step": 383900 }, { "epoch": 7.560394558091001, "grad_norm": 1.696427345275879, "learning_rate": 4.936858905083911e-07, "loss": 1.9938, "num_input_tokens_seen": 390995368, "step": 384000 }, { "epoch": 7.562363410840503, "grad_norm": 2.0347323417663574, "learning_rate": 4.934797270076633e-07, "loss": 1.9942, "num_input_tokens_seen": 391097768, "step": 384100 }, { "epoch": 7.564332263590006, "grad_norm": 1.7522668838500977, "learning_rate": 4.932735646156531e-07, "loss": 1.974, "num_input_tokens_seen": 391199616, "step": 384200 }, { "epoch": 7.566301116339509, "grad_norm": 1.8831778764724731, "learning_rate": 4.930674033674164e-07, "loss": 1.9561, "num_input_tokens_seen": 391301424, "step": 384300 }, { "epoch": 7.568269969089012, "grad_norm": 1.9880766868591309, "learning_rate": 4.928612432980095e-07, "loss": 1.9855, "num_input_tokens_seen": 391402448, "step": 384400 }, { "epoch": 7.570238821838514, "grad_norm": 1.921152949333191, "learning_rate": 4.926550844424879e-07, "loss": 1.9937, "num_input_tokens_seen": 391504024, "step": 384500 }, { "epoch": 7.572207674588018, "grad_norm": 1.8683234453201294, "learning_rate": 4.92448926835907e-07, "loss": 1.9792, "num_input_tokens_seen": 391605728, "step": 384600 }, { "epoch": 7.574176527337521, "grad_norm": 1.8247352838516235, "learning_rate": 4.922427705133227e-07, "loss": 1.9588, "num_input_tokens_seen": 391707208, "step": 384700 }, { "epoch": 7.5761453800870235, "grad_norm": 1.791880488395691, "learning_rate": 4.920366155097896e-07, "loss": 1.9357, "num_input_tokens_seen": 391809608, "step": 384800 }, { "epoch": 7.578114232836526, "grad_norm": 1.5522576570510864, "learning_rate": 4.918304618603631e-07, "loss": 1.9654, "num_input_tokens_seen": 391912008, "step": 384900 }, { "epoch": 7.580083085586029, "grad_norm": 1.8918328285217285, "learning_rate": 4.916243096000975e-07, "loss": 1.9744, "num_input_tokens_seen": 392014408, "step": 385000 }, { "epoch": 7.582051938335532, "grad_norm": 1.887258529663086, "learning_rate": 4.914181587640474e-07, "loss": 2.0231, "num_input_tokens_seen": 392116808, "step": 385100 }, { "epoch": 7.584020791085035, "grad_norm": 1.9370651245117188, "learning_rate": 4.912120093872671e-07, "loss": 2.009, "num_input_tokens_seen": 392219208, "step": 385200 }, { "epoch": 7.585989643834537, "grad_norm": 1.9454143047332764, "learning_rate": 4.910058615048106e-07, "loss": 2.024, "num_input_tokens_seen": 392321608, "step": 385300 }, { "epoch": 7.58795849658404, "grad_norm": 2.1019110679626465, "learning_rate": 4.907997151517312e-07, "loss": 1.9752, "num_input_tokens_seen": 392424008, "step": 385400 }, { "epoch": 7.589927349333543, "grad_norm": 2.675548791885376, "learning_rate": 4.905935703630827e-07, "loss": 2.0203, "num_input_tokens_seen": 392525792, "step": 385500 }, { "epoch": 7.5918962020830465, "grad_norm": 1.801592469215393, "learning_rate": 4.903874271739183e-07, "loss": 2.0493, "num_input_tokens_seen": 392627240, "step": 385600 }, { "epoch": 7.593865054832549, "grad_norm": 1.9246045351028442, "learning_rate": 4.901812856192907e-07, "loss": 1.9721, "num_input_tokens_seen": 392728072, "step": 385700 }, { "epoch": 7.595833907582052, "grad_norm": 1.9143521785736084, "learning_rate": 4.89975145734253e-07, "loss": 1.958, "num_input_tokens_seen": 392829648, "step": 385800 }, { "epoch": 7.597802760331555, "grad_norm": 2.024423837661743, "learning_rate": 4.897690075538569e-07, "loss": 1.9792, "num_input_tokens_seen": 392932048, "step": 385900 }, { "epoch": 7.5997716130810575, "grad_norm": 1.9278972148895264, "learning_rate": 4.89562871113155e-07, "loss": 1.9674, "num_input_tokens_seen": 393033064, "step": 386000 }, { "epoch": 7.60174046583056, "grad_norm": 2.0642013549804688, "learning_rate": 4.893567364471987e-07, "loss": 1.9564, "num_input_tokens_seen": 393134944, "step": 386100 }, { "epoch": 7.603709318580063, "grad_norm": 2.1293320655822754, "learning_rate": 4.891506035910397e-07, "loss": 1.9998, "num_input_tokens_seen": 393235272, "step": 386200 }, { "epoch": 7.605678171329567, "grad_norm": 1.9950129985809326, "learning_rate": 4.889444725797289e-07, "loss": 1.9973, "num_input_tokens_seen": 393337672, "step": 386300 }, { "epoch": 7.6076470240790695, "grad_norm": 4.244419574737549, "learning_rate": 4.887383434483174e-07, "loss": 1.9796, "num_input_tokens_seen": 393438960, "step": 386400 }, { "epoch": 7.609615876828572, "grad_norm": 1.8216191530227661, "learning_rate": 4.885322162318555e-07, "loss": 1.9711, "num_input_tokens_seen": 393541000, "step": 386500 }, { "epoch": 7.611584729578075, "grad_norm": 1.9653093814849854, "learning_rate": 4.883260909653936e-07, "loss": 1.9856, "num_input_tokens_seen": 393643400, "step": 386600 }, { "epoch": 7.613553582327578, "grad_norm": 1.881364345550537, "learning_rate": 4.881199676839815e-07, "loss": 2.0015, "num_input_tokens_seen": 393745800, "step": 386700 }, { "epoch": 7.6155224350770805, "grad_norm": 2.109998941421509, "learning_rate": 4.879138464226685e-07, "loss": 1.9924, "num_input_tokens_seen": 393847664, "step": 386800 }, { "epoch": 7.617491287826583, "grad_norm": 1.764736533164978, "learning_rate": 4.877077272165042e-07, "loss": 1.9937, "num_input_tokens_seen": 393949472, "step": 386900 }, { "epoch": 7.619460140576086, "grad_norm": 1.9379056692123413, "learning_rate": 4.875016101005371e-07, "loss": 1.9419, "num_input_tokens_seen": 394051872, "step": 387000 }, { "epoch": 7.621428993325589, "grad_norm": 2.092463254928589, "learning_rate": 4.87295495109816e-07, "loss": 2.0024, "num_input_tokens_seen": 394154272, "step": 387100 }, { "epoch": 7.623397846075092, "grad_norm": 3.187513589859009, "learning_rate": 4.870893822793885e-07, "loss": 1.9831, "num_input_tokens_seen": 394256672, "step": 387200 }, { "epoch": 7.625366698824595, "grad_norm": 1.91410231590271, "learning_rate": 4.868832716443028e-07, "loss": 1.953, "num_input_tokens_seen": 394359072, "step": 387300 }, { "epoch": 7.627335551574098, "grad_norm": 1.9732996225357056, "learning_rate": 4.866771632396061e-07, "loss": 2.0144, "num_input_tokens_seen": 394460712, "step": 387400 }, { "epoch": 7.629304404323601, "grad_norm": 2.05133056640625, "learning_rate": 4.864710571003452e-07, "loss": 1.9703, "num_input_tokens_seen": 394562384, "step": 387500 }, { "epoch": 7.6312732570731034, "grad_norm": 1.8731197118759155, "learning_rate": 4.862649532615674e-07, "loss": 1.9779, "num_input_tokens_seen": 394664784, "step": 387600 }, { "epoch": 7.633242109822606, "grad_norm": 1.89797842502594, "learning_rate": 4.86058851758318e-07, "loss": 1.9999, "num_input_tokens_seen": 394765608, "step": 387700 }, { "epoch": 7.635210962572109, "grad_norm": 1.8504283428192139, "learning_rate": 4.858527526256436e-07, "loss": 1.9336, "num_input_tokens_seen": 394868008, "step": 387800 }, { "epoch": 7.637179815321612, "grad_norm": 2.03739595413208, "learning_rate": 4.856466558985891e-07, "loss": 1.979, "num_input_tokens_seen": 394969544, "step": 387900 }, { "epoch": 7.639148668071115, "grad_norm": 1.8224365711212158, "learning_rate": 4.854405616122e-07, "loss": 1.9796, "num_input_tokens_seen": 395071944, "step": 388000 }, { "epoch": 7.641117520820618, "grad_norm": 1.924677848815918, "learning_rate": 4.852344698015201e-07, "loss": 1.9725, "num_input_tokens_seen": 395171824, "step": 388100 }, { "epoch": 7.643086373570121, "grad_norm": 1.9571551084518433, "learning_rate": 4.850283805015945e-07, "loss": 1.9663, "num_input_tokens_seen": 395272880, "step": 388200 }, { "epoch": 7.645055226319624, "grad_norm": 2.1420321464538574, "learning_rate": 4.848222937474662e-07, "loss": 1.9934, "num_input_tokens_seen": 395375280, "step": 388300 }, { "epoch": 7.647024079069126, "grad_norm": 2.246696710586548, "learning_rate": 4.846162095741792e-07, "loss": 1.9712, "num_input_tokens_seen": 395477680, "step": 388400 }, { "epoch": 7.648992931818629, "grad_norm": 1.9481399059295654, "learning_rate": 4.844101280167757e-07, "loss": 1.9636, "num_input_tokens_seen": 395579736, "step": 388500 }, { "epoch": 7.650961784568132, "grad_norm": 1.8843986988067627, "learning_rate": 4.842040491102984e-07, "loss": 1.9908, "num_input_tokens_seen": 395681648, "step": 388600 }, { "epoch": 7.652930637317635, "grad_norm": 2.112291097640991, "learning_rate": 4.839979728897896e-07, "loss": 2.0163, "num_input_tokens_seen": 395782360, "step": 388700 }, { "epoch": 7.654899490067137, "grad_norm": 2.6145987510681152, "learning_rate": 4.837918993902903e-07, "loss": 1.9773, "num_input_tokens_seen": 395884760, "step": 388800 }, { "epoch": 7.656868342816641, "grad_norm": 1.907364010810852, "learning_rate": 4.83585828646842e-07, "loss": 2.0128, "num_input_tokens_seen": 395986296, "step": 388900 }, { "epoch": 7.658837195566144, "grad_norm": 1.9746867418289185, "learning_rate": 4.833797606944849e-07, "loss": 2.0028, "num_input_tokens_seen": 396088696, "step": 389000 }, { "epoch": 7.660806048315647, "grad_norm": 7.818428039550781, "learning_rate": 4.831736955682594e-07, "loss": 2.0206, "num_input_tokens_seen": 396186872, "step": 389100 }, { "epoch": 7.662774901065149, "grad_norm": 2.0519790649414062, "learning_rate": 4.829676333032052e-07, "loss": 1.966, "num_input_tokens_seen": 396288552, "step": 389200 }, { "epoch": 7.664743753814652, "grad_norm": 2.0729544162750244, "learning_rate": 4.827615739343611e-07, "loss": 1.9273, "num_input_tokens_seen": 396390520, "step": 389300 }, { "epoch": 7.666712606564155, "grad_norm": 1.708318829536438, "learning_rate": 4.82555517496766e-07, "loss": 1.9807, "num_input_tokens_seen": 396492920, "step": 389400 }, { "epoch": 7.668681459313658, "grad_norm": 2.070146083831787, "learning_rate": 4.82349464025458e-07, "loss": 1.9808, "num_input_tokens_seen": 396594784, "step": 389500 }, { "epoch": 7.670650312063161, "grad_norm": 1.8097540140151978, "learning_rate": 4.821434135554749e-07, "loss": 1.9423, "num_input_tokens_seen": 396696568, "step": 389600 }, { "epoch": 7.672619164812664, "grad_norm": 1.9803106784820557, "learning_rate": 4.819373661218535e-07, "loss": 1.9883, "num_input_tokens_seen": 396797000, "step": 389700 }, { "epoch": 7.674588017562167, "grad_norm": 2.093808650970459, "learning_rate": 4.817313217596309e-07, "loss": 1.9903, "num_input_tokens_seen": 396899400, "step": 389800 }, { "epoch": 7.67655687031167, "grad_norm": 1.857288122177124, "learning_rate": 4.815252805038427e-07, "loss": 1.9834, "num_input_tokens_seen": 397001352, "step": 389900 }, { "epoch": 7.678525723061172, "grad_norm": 1.878383994102478, "learning_rate": 4.813192423895249e-07, "loss": 1.9938, "num_input_tokens_seen": 397103120, "step": 390000 }, { "epoch": 7.680494575810675, "grad_norm": 2.043911933898926, "learning_rate": 4.811132074517123e-07, "loss": 1.9765, "num_input_tokens_seen": 397203488, "step": 390100 }, { "epoch": 7.682463428560178, "grad_norm": 2.0450711250305176, "learning_rate": 4.809071757254396e-07, "loss": 1.988, "num_input_tokens_seen": 397305888, "step": 390200 }, { "epoch": 7.684432281309681, "grad_norm": 1.8975472450256348, "learning_rate": 4.807011472457404e-07, "loss": 1.9803, "num_input_tokens_seen": 397406536, "step": 390300 }, { "epoch": 7.686401134059183, "grad_norm": 1.8599958419799805, "learning_rate": 4.804951220476485e-07, "loss": 1.9725, "num_input_tokens_seen": 397508936, "step": 390400 }, { "epoch": 7.688369986808686, "grad_norm": 2.0340805053710938, "learning_rate": 4.802891001661966e-07, "loss": 1.9621, "num_input_tokens_seen": 397611336, "step": 390500 }, { "epoch": 7.69033883955819, "grad_norm": 1.8969095945358276, "learning_rate": 4.800830816364167e-07, "loss": 1.9509, "num_input_tokens_seen": 397713048, "step": 390600 }, { "epoch": 7.6923076923076925, "grad_norm": 3.9885106086730957, "learning_rate": 4.798770664933412e-07, "loss": 1.9888, "num_input_tokens_seen": 397815448, "step": 390700 }, { "epoch": 7.694276545057195, "grad_norm": 1.7530561685562134, "learning_rate": 4.796710547720004e-07, "loss": 1.9934, "num_input_tokens_seen": 397917848, "step": 390800 }, { "epoch": 7.696245397806698, "grad_norm": 1.9657628536224365, "learning_rate": 4.794650465074256e-07, "loss": 1.9506, "num_input_tokens_seen": 398019352, "step": 390900 }, { "epoch": 7.698214250556201, "grad_norm": 1.740928053855896, "learning_rate": 4.792590417346462e-07, "loss": 2.0059, "num_input_tokens_seen": 398121752, "step": 391000 }, { "epoch": 7.7001831033057035, "grad_norm": 1.9120116233825684, "learning_rate": 4.790530404886921e-07, "loss": 1.9725, "num_input_tokens_seen": 398224152, "step": 391100 }, { "epoch": 7.702151956055206, "grad_norm": 1.888346552848816, "learning_rate": 4.788470428045915e-07, "loss": 1.9833, "num_input_tokens_seen": 398326000, "step": 391200 }, { "epoch": 7.70412080880471, "grad_norm": 2.0434091091156006, "learning_rate": 4.786410487173732e-07, "loss": 2.0046, "num_input_tokens_seen": 398427544, "step": 391300 }, { "epoch": 7.706089661554213, "grad_norm": 1.8175299167633057, "learning_rate": 4.78435058262064e-07, "loss": 2.0173, "num_input_tokens_seen": 398529944, "step": 391400 }, { "epoch": 7.7080585143037155, "grad_norm": 1.7560431957244873, "learning_rate": 4.782290714736917e-07, "loss": 1.9428, "num_input_tokens_seen": 398632344, "step": 391500 }, { "epoch": 7.710027367053218, "grad_norm": 2.2701058387756348, "learning_rate": 4.780230883872817e-07, "loss": 1.9698, "num_input_tokens_seen": 398734648, "step": 391600 }, { "epoch": 7.711996219802721, "grad_norm": 1.9336497783660889, "learning_rate": 4.778171090378603e-07, "loss": 2.0544, "num_input_tokens_seen": 398836128, "step": 391700 }, { "epoch": 7.713965072552224, "grad_norm": 1.927473783493042, "learning_rate": 4.776111334604525e-07, "loss": 2.0082, "num_input_tokens_seen": 398937160, "step": 391800 }, { "epoch": 7.7159339253017265, "grad_norm": 1.9075239896774292, "learning_rate": 4.774051616900827e-07, "loss": 2.0238, "num_input_tokens_seen": 399039032, "step": 391900 }, { "epoch": 7.717902778051229, "grad_norm": 1.7816892862319946, "learning_rate": 4.771991937617745e-07, "loss": 2.0645, "num_input_tokens_seen": 399141432, "step": 392000 }, { "epoch": 7.719871630800732, "grad_norm": 1.7579230070114136, "learning_rate": 4.769932297105511e-07, "loss": 1.9769, "num_input_tokens_seen": 399243832, "step": 392100 }, { "epoch": 7.721840483550235, "grad_norm": 1.818023443222046, "learning_rate": 4.7678726957143496e-07, "loss": 1.9954, "num_input_tokens_seen": 399345736, "step": 392200 }, { "epoch": 7.723809336299738, "grad_norm": 2.290372610092163, "learning_rate": 4.7658131337944773e-07, "loss": 1.976, "num_input_tokens_seen": 399447264, "step": 392300 }, { "epoch": 7.725778189049241, "grad_norm": 1.9023890495300293, "learning_rate": 4.763753611696108e-07, "loss": 1.9823, "num_input_tokens_seen": 399548816, "step": 392400 }, { "epoch": 7.727747041798744, "grad_norm": 1.8353853225708008, "learning_rate": 4.761694129769443e-07, "loss": 2.0116, "num_input_tokens_seen": 399648552, "step": 392500 }, { "epoch": 7.729715894548247, "grad_norm": 1.8889423608779907, "learning_rate": 4.7596346883646813e-07, "loss": 1.9611, "num_input_tokens_seen": 399748816, "step": 392600 }, { "epoch": 7.7316847472977495, "grad_norm": 1.9612642526626587, "learning_rate": 4.7575752878320144e-07, "loss": 1.9828, "num_input_tokens_seen": 399851216, "step": 392700 }, { "epoch": 7.733653600047252, "grad_norm": 2.140599250793457, "learning_rate": 4.7555159285216247e-07, "loss": 2.0269, "num_input_tokens_seen": 399950960, "step": 392800 }, { "epoch": 7.735622452796755, "grad_norm": 1.8909140825271606, "learning_rate": 4.75345661078369e-07, "loss": 1.9684, "num_input_tokens_seen": 400053360, "step": 392900 }, { "epoch": 7.737591305546259, "grad_norm": 1.9300559759140015, "learning_rate": 4.751397334968378e-07, "loss": 1.9843, "num_input_tokens_seen": 400154744, "step": 393000 }, { "epoch": 7.739560158295761, "grad_norm": 2.045980215072632, "learning_rate": 4.749338101425854e-07, "loss": 1.9915, "num_input_tokens_seen": 400256528, "step": 393100 }, { "epoch": 7.741529011045264, "grad_norm": 1.9019083976745605, "learning_rate": 4.74727891050627e-07, "loss": 1.9644, "num_input_tokens_seen": 400358928, "step": 393200 }, { "epoch": 7.743497863794767, "grad_norm": 1.8340952396392822, "learning_rate": 4.7452197625597766e-07, "loss": 1.9546, "num_input_tokens_seen": 400461328, "step": 393300 }, { "epoch": 7.74546671654427, "grad_norm": 2.0176641941070557, "learning_rate": 4.743160657936513e-07, "loss": 1.9918, "num_input_tokens_seen": 400563576, "step": 393400 }, { "epoch": 7.747435569293772, "grad_norm": 1.8577077388763428, "learning_rate": 4.741101596986614e-07, "loss": 1.9688, "num_input_tokens_seen": 400665368, "step": 393500 }, { "epoch": 7.749404422043275, "grad_norm": 2.1358959674835205, "learning_rate": 4.739042580060204e-07, "loss": 1.9926, "num_input_tokens_seen": 400767768, "step": 393600 }, { "epoch": 7.751373274792778, "grad_norm": 1.9763044118881226, "learning_rate": 4.736983607507401e-07, "loss": 1.9844, "num_input_tokens_seen": 400867752, "step": 393700 }, { "epoch": 7.753342127542281, "grad_norm": 1.8901817798614502, "learning_rate": 4.7349246796783196e-07, "loss": 2.0072, "num_input_tokens_seen": 400970152, "step": 393800 }, { "epoch": 7.755310980291784, "grad_norm": 1.8803762197494507, "learning_rate": 4.7328657969230597e-07, "loss": 1.9447, "num_input_tokens_seen": 401072552, "step": 393900 }, { "epoch": 7.757279833041287, "grad_norm": 1.8106716871261597, "learning_rate": 4.7308069595917183e-07, "loss": 1.9471, "num_input_tokens_seen": 401174952, "step": 394000 }, { "epoch": 7.75924868579079, "grad_norm": 1.8555454015731812, "learning_rate": 4.728748168034382e-07, "loss": 1.9743, "num_input_tokens_seen": 401276432, "step": 394100 }, { "epoch": 7.761217538540293, "grad_norm": 1.8103944063186646, "learning_rate": 4.726689422601133e-07, "loss": 1.977, "num_input_tokens_seen": 401377968, "step": 394200 }, { "epoch": 7.763186391289795, "grad_norm": 1.7369400262832642, "learning_rate": 4.724630723642042e-07, "loss": 1.9881, "num_input_tokens_seen": 401479664, "step": 394300 }, { "epoch": 7.765155244039298, "grad_norm": 1.6876634359359741, "learning_rate": 4.722572071507175e-07, "loss": 2.0008, "num_input_tokens_seen": 401581496, "step": 394400 }, { "epoch": 7.767124096788801, "grad_norm": 1.995707392692566, "learning_rate": 4.720513466546586e-07, "loss": 1.9945, "num_input_tokens_seen": 401682520, "step": 394500 }, { "epoch": 7.769092949538304, "grad_norm": 1.9053715467453003, "learning_rate": 4.718454909110324e-07, "loss": 1.9644, "num_input_tokens_seen": 401783360, "step": 394600 }, { "epoch": 7.771061802287807, "grad_norm": 1.96388840675354, "learning_rate": 4.7163963995484316e-07, "loss": 1.9831, "num_input_tokens_seen": 401884904, "step": 394700 }, { "epoch": 7.77303065503731, "grad_norm": 2.2016756534576416, "learning_rate": 4.7143379382109386e-07, "loss": 2.0089, "num_input_tokens_seen": 401987304, "step": 394800 }, { "epoch": 7.774999507786813, "grad_norm": 1.8871245384216309, "learning_rate": 4.712279525447871e-07, "loss": 1.9891, "num_input_tokens_seen": 402089704, "step": 394900 }, { "epoch": 7.776968360536316, "grad_norm": 1.7828187942504883, "learning_rate": 4.710221161609242e-07, "loss": 2.0138, "num_input_tokens_seen": 402192104, "step": 395000 }, { "epoch": 7.778937213285818, "grad_norm": 1.8966572284698486, "learning_rate": 4.7081628470450615e-07, "loss": 1.9889, "num_input_tokens_seen": 402293080, "step": 395100 }, { "epoch": 7.780906066035321, "grad_norm": 1.9374722242355347, "learning_rate": 4.706104582105326e-07, "loss": 1.9743, "num_input_tokens_seen": 402394952, "step": 395200 }, { "epoch": 7.782874918784824, "grad_norm": 1.7325857877731323, "learning_rate": 4.704046367140029e-07, "loss": 1.958, "num_input_tokens_seen": 402497352, "step": 395300 }, { "epoch": 7.784843771534327, "grad_norm": 1.7959673404693604, "learning_rate": 4.701988202499149e-07, "loss": 1.9683, "num_input_tokens_seen": 402599664, "step": 395400 }, { "epoch": 7.786812624283829, "grad_norm": 1.869259238243103, "learning_rate": 4.6999300885326636e-07, "loss": 1.9328, "num_input_tokens_seen": 402701568, "step": 395500 }, { "epoch": 7.788781477033333, "grad_norm": 1.8868144750595093, "learning_rate": 4.697872025590535e-07, "loss": 1.9937, "num_input_tokens_seen": 402802392, "step": 395600 }, { "epoch": 7.790750329782836, "grad_norm": 1.6757222414016724, "learning_rate": 4.695814014022719e-07, "loss": 2.0285, "num_input_tokens_seen": 402904168, "step": 395700 }, { "epoch": 7.7927191825323385, "grad_norm": 2.2087771892547607, "learning_rate": 4.6937560541791666e-07, "loss": 1.9824, "num_input_tokens_seen": 403006568, "step": 395800 }, { "epoch": 7.794688035281841, "grad_norm": 1.9810230731964111, "learning_rate": 4.6916981464098114e-07, "loss": 1.9791, "num_input_tokens_seen": 403108968, "step": 395900 }, { "epoch": 7.796656888031344, "grad_norm": 1.995098352432251, "learning_rate": 4.68964029106459e-07, "loss": 2.0055, "num_input_tokens_seen": 403211368, "step": 396000 }, { "epoch": 7.798625740780847, "grad_norm": 2.033402442932129, "learning_rate": 4.687582488493415e-07, "loss": 1.9772, "num_input_tokens_seen": 403312368, "step": 396100 }, { "epoch": 7.80059459353035, "grad_norm": 1.8263920545578003, "learning_rate": 4.6855247390462073e-07, "loss": 1.9769, "num_input_tokens_seen": 403414768, "step": 396200 }, { "epoch": 7.802563446279853, "grad_norm": 2.0289275646209717, "learning_rate": 4.683467043072861e-07, "loss": 1.9937, "num_input_tokens_seen": 403516520, "step": 396300 }, { "epoch": 7.804532299029356, "grad_norm": 2.0917770862579346, "learning_rate": 4.681409400923278e-07, "loss": 2.0014, "num_input_tokens_seen": 403617400, "step": 396400 }, { "epoch": 7.806501151778859, "grad_norm": 1.7936569452285767, "learning_rate": 4.6793518129473357e-07, "loss": 1.9794, "num_input_tokens_seen": 403719576, "step": 396500 }, { "epoch": 7.8084700045283615, "grad_norm": 1.9994819164276123, "learning_rate": 4.6772942794949144e-07, "loss": 1.9729, "num_input_tokens_seen": 403821376, "step": 396600 }, { "epoch": 7.810438857277864, "grad_norm": 2.0388219356536865, "learning_rate": 4.6752368009158803e-07, "loss": 1.9552, "num_input_tokens_seen": 403922984, "step": 396700 }, { "epoch": 7.812407710027367, "grad_norm": 1.730810284614563, "learning_rate": 4.673179377560088e-07, "loss": 1.9948, "num_input_tokens_seen": 404024592, "step": 396800 }, { "epoch": 7.81437656277687, "grad_norm": 2.40167236328125, "learning_rate": 4.671122009777387e-07, "loss": 2.0161, "num_input_tokens_seen": 404126120, "step": 396900 }, { "epoch": 7.8163454155263725, "grad_norm": 1.8635061979293823, "learning_rate": 4.669064697917613e-07, "loss": 1.9808, "num_input_tokens_seen": 404228520, "step": 397000 }, { "epoch": 7.818314268275875, "grad_norm": 2.0289618968963623, "learning_rate": 4.667007442330597e-07, "loss": 1.9991, "num_input_tokens_seen": 404330920, "step": 397100 }, { "epoch": 7.820283121025378, "grad_norm": 1.9724180698394775, "learning_rate": 4.6649502433661566e-07, "loss": 2.018, "num_input_tokens_seen": 404432512, "step": 397200 }, { "epoch": 7.822251973774882, "grad_norm": 1.9349591732025146, "learning_rate": 4.6628931013741015e-07, "loss": 1.9811, "num_input_tokens_seen": 404534912, "step": 397300 }, { "epoch": 7.824220826524384, "grad_norm": 1.8570616245269775, "learning_rate": 4.66083601670423e-07, "loss": 1.9727, "num_input_tokens_seen": 404636808, "step": 397400 }, { "epoch": 7.826189679273887, "grad_norm": 1.8789377212524414, "learning_rate": 4.658778989706335e-07, "loss": 1.9598, "num_input_tokens_seen": 404739208, "step": 397500 }, { "epoch": 7.82815853202339, "grad_norm": 2.2145214080810547, "learning_rate": 4.6567220207301933e-07, "loss": 1.9881, "num_input_tokens_seen": 404841608, "step": 397600 }, { "epoch": 7.830127384772893, "grad_norm": 1.8645763397216797, "learning_rate": 4.654665110125577e-07, "loss": 1.9751, "num_input_tokens_seen": 404942592, "step": 397700 }, { "epoch": 7.8320962375223955, "grad_norm": 1.8048200607299805, "learning_rate": 4.652608258242247e-07, "loss": 1.9832, "num_input_tokens_seen": 405044224, "step": 397800 }, { "epoch": 7.834065090271898, "grad_norm": 2.1996238231658936, "learning_rate": 4.6505514654299516e-07, "loss": 1.9799, "num_input_tokens_seen": 405146624, "step": 397900 }, { "epoch": 7.836033943021402, "grad_norm": 2.032247304916382, "learning_rate": 4.648494732038433e-07, "loss": 2.0608, "num_input_tokens_seen": 405248096, "step": 398000 }, { "epoch": 7.838002795770905, "grad_norm": 1.9830822944641113, "learning_rate": 4.646438058417419e-07, "loss": 2.0156, "num_input_tokens_seen": 405350312, "step": 398100 }, { "epoch": 7.839971648520407, "grad_norm": 6.491024971008301, "learning_rate": 4.644381444916633e-07, "loss": 1.9995, "num_input_tokens_seen": 405451952, "step": 398200 }, { "epoch": 7.84194050126991, "grad_norm": 2.343080759048462, "learning_rate": 4.642324891885781e-07, "loss": 1.9949, "num_input_tokens_seen": 405553912, "step": 398300 }, { "epoch": 7.843909354019413, "grad_norm": 1.8906807899475098, "learning_rate": 4.640268399674566e-07, "loss": 1.9698, "num_input_tokens_seen": 405656312, "step": 398400 }, { "epoch": 7.845878206768916, "grad_norm": 1.6502257585525513, "learning_rate": 4.638211968632674e-07, "loss": 1.9669, "num_input_tokens_seen": 405758712, "step": 398500 }, { "epoch": 7.847847059518418, "grad_norm": 2.343722343444824, "learning_rate": 4.6361555991097875e-07, "loss": 1.961, "num_input_tokens_seen": 405861112, "step": 398600 }, { "epoch": 7.849815912267921, "grad_norm": 1.8986138105392456, "learning_rate": 4.6340992914555705e-07, "loss": 1.9415, "num_input_tokens_seen": 405963512, "step": 398700 }, { "epoch": 7.851784765017424, "grad_norm": 7.905078411102295, "learning_rate": 4.632043046019681e-07, "loss": 1.9445, "num_input_tokens_seen": 406065912, "step": 398800 }, { "epoch": 7.853753617766928, "grad_norm": 1.8434330224990845, "learning_rate": 4.629986863151773e-07, "loss": 1.9951, "num_input_tokens_seen": 406167552, "step": 398900 }, { "epoch": 7.85572247051643, "grad_norm": 1.826326847076416, "learning_rate": 4.6279307432014734e-07, "loss": 2.0278, "num_input_tokens_seen": 406269176, "step": 399000 }, { "epoch": 7.857691323265933, "grad_norm": 1.8038502931594849, "learning_rate": 4.625874686518417e-07, "loss": 1.9805, "num_input_tokens_seen": 406371056, "step": 399100 }, { "epoch": 7.859660176015436, "grad_norm": 1.8817472457885742, "learning_rate": 4.6238186934522113e-07, "loss": 1.9664, "num_input_tokens_seen": 406472104, "step": 399200 }, { "epoch": 7.861629028764939, "grad_norm": 2.131619691848755, "learning_rate": 4.6217627643524663e-07, "loss": 1.9796, "num_input_tokens_seen": 406573320, "step": 399300 }, { "epoch": 7.863597881514441, "grad_norm": 1.8322391510009766, "learning_rate": 4.6197068995687704e-07, "loss": 1.9649, "num_input_tokens_seen": 406674712, "step": 399400 }, { "epoch": 7.865566734263944, "grad_norm": 2.3457887172698975, "learning_rate": 4.617651099450712e-07, "loss": 1.9471, "num_input_tokens_seen": 406777112, "step": 399500 }, { "epoch": 7.867535587013447, "grad_norm": 2.0252625942230225, "learning_rate": 4.6155953643478563e-07, "loss": 2.0067, "num_input_tokens_seen": 406879512, "step": 399600 }, { "epoch": 7.8695044397629506, "grad_norm": 1.6721338033676147, "learning_rate": 4.613539694609767e-07, "loss": 1.9832, "num_input_tokens_seen": 406981272, "step": 399700 }, { "epoch": 7.871473292512453, "grad_norm": 2.0794119834899902, "learning_rate": 4.611484090585994e-07, "loss": 1.9983, "num_input_tokens_seen": 407082968, "step": 399800 }, { "epoch": 7.873442145261956, "grad_norm": 1.8829747438430786, "learning_rate": 4.6094285526260745e-07, "loss": 1.9626, "num_input_tokens_seen": 407184656, "step": 399900 }, { "epoch": 7.875410998011459, "grad_norm": 2.0226032733917236, "learning_rate": 4.6073730810795357e-07, "loss": 1.9699, "num_input_tokens_seen": 407286960, "step": 400000 }, { "epoch": 7.877379850760962, "grad_norm": 1.9273746013641357, "learning_rate": 4.605317676295892e-07, "loss": 1.9637, "num_input_tokens_seen": 407389360, "step": 400100 }, { "epoch": 7.879348703510464, "grad_norm": 1.977658987045288, "learning_rate": 4.60326233862465e-07, "loss": 2.0162, "num_input_tokens_seen": 407490992, "step": 400200 }, { "epoch": 7.881317556259967, "grad_norm": 1.931991696357727, "learning_rate": 4.601207068415299e-07, "loss": 1.9887, "num_input_tokens_seen": 407592552, "step": 400300 }, { "epoch": 7.88328640900947, "grad_norm": 1.8200076818466187, "learning_rate": 4.5991518660173243e-07, "loss": 1.9524, "num_input_tokens_seen": 407694568, "step": 400400 }, { "epoch": 7.885255261758973, "grad_norm": 2.193967342376709, "learning_rate": 4.597096731780193e-07, "loss": 1.9761, "num_input_tokens_seen": 407796968, "step": 400500 }, { "epoch": 7.887224114508476, "grad_norm": 3.000343084335327, "learning_rate": 4.5950416660533647e-07, "loss": 1.93, "num_input_tokens_seen": 407899368, "step": 400600 }, { "epoch": 7.889192967257979, "grad_norm": 1.9227417707443237, "learning_rate": 4.5929866691862843e-07, "loss": 2.0151, "num_input_tokens_seen": 408001720, "step": 400700 }, { "epoch": 7.891161820007482, "grad_norm": 2.170828104019165, "learning_rate": 4.590931741528389e-07, "loss": 2.0049, "num_input_tokens_seen": 408102688, "step": 400800 }, { "epoch": 7.8931306727569845, "grad_norm": 2.095761775970459, "learning_rate": 4.588876883429101e-07, "loss": 2.0066, "num_input_tokens_seen": 408203800, "step": 400900 }, { "epoch": 7.895099525506487, "grad_norm": 2.010538339614868, "learning_rate": 4.58682209523783e-07, "loss": 1.9598, "num_input_tokens_seen": 408306200, "step": 401000 }, { "epoch": 7.89706837825599, "grad_norm": 2.081390619277954, "learning_rate": 4.5847673773039787e-07, "loss": 1.976, "num_input_tokens_seen": 408407960, "step": 401100 }, { "epoch": 7.899037231005493, "grad_norm": 1.9769562482833862, "learning_rate": 4.5827127299769316e-07, "loss": 2.0141, "num_input_tokens_seen": 408510272, "step": 401200 }, { "epoch": 7.901006083754996, "grad_norm": 1.8852051496505737, "learning_rate": 4.580658153606066e-07, "loss": 1.9648, "num_input_tokens_seen": 408612672, "step": 401300 }, { "epoch": 7.902974936504499, "grad_norm": 1.8560434579849243, "learning_rate": 4.5786036485407433e-07, "loss": 1.918, "num_input_tokens_seen": 408715072, "step": 401400 }, { "epoch": 7.904943789254002, "grad_norm": 1.6488037109375, "learning_rate": 4.5765492151303164e-07, "loss": 1.95, "num_input_tokens_seen": 408816776, "step": 401500 }, { "epoch": 7.906912642003505, "grad_norm": 1.9223612546920776, "learning_rate": 4.574494853724123e-07, "loss": 2.0605, "num_input_tokens_seen": 408917632, "step": 401600 }, { "epoch": 7.9088814947530075, "grad_norm": 1.851230263710022, "learning_rate": 4.572440564671491e-07, "loss": 1.9802, "num_input_tokens_seen": 409019168, "step": 401700 }, { "epoch": 7.91085034750251, "grad_norm": 1.7442927360534668, "learning_rate": 4.5703863483217376e-07, "loss": 1.9523, "num_input_tokens_seen": 409121568, "step": 401800 }, { "epoch": 7.912819200252013, "grad_norm": 1.6845791339874268, "learning_rate": 4.5683322050241577e-07, "loss": 1.9634, "num_input_tokens_seen": 409223392, "step": 401900 }, { "epoch": 7.914788053001516, "grad_norm": 2.3882217407226562, "learning_rate": 4.566278135128049e-07, "loss": 1.971, "num_input_tokens_seen": 409325792, "step": 402000 }, { "epoch": 7.9167569057510185, "grad_norm": 1.6533641815185547, "learning_rate": 4.5642241389826835e-07, "loss": 1.9921, "num_input_tokens_seen": 409426512, "step": 402100 }, { "epoch": 7.918725758500521, "grad_norm": 2.1795406341552734, "learning_rate": 4.562170216937329e-07, "loss": 1.9475, "num_input_tokens_seen": 409528912, "step": 402200 }, { "epoch": 7.920694611250025, "grad_norm": 1.9683079719543457, "learning_rate": 4.560116369341235e-07, "loss": 1.9598, "num_input_tokens_seen": 409630464, "step": 402300 }, { "epoch": 7.922663463999528, "grad_norm": 2.0511274337768555, "learning_rate": 4.558062596543644e-07, "loss": 2.0094, "num_input_tokens_seen": 409731408, "step": 402400 }, { "epoch": 7.9246323167490305, "grad_norm": 2.065965414047241, "learning_rate": 4.556008898893779e-07, "loss": 1.9937, "num_input_tokens_seen": 409833592, "step": 402500 }, { "epoch": 7.926601169498533, "grad_norm": 1.9457801580429077, "learning_rate": 4.5539552767408575e-07, "loss": 2.0166, "num_input_tokens_seen": 409935264, "step": 402600 }, { "epoch": 7.928570022248036, "grad_norm": 1.847275733947754, "learning_rate": 4.5519017304340777e-07, "loss": 2.023, "num_input_tokens_seen": 410036296, "step": 402700 }, { "epoch": 7.930538874997539, "grad_norm": 1.8351118564605713, "learning_rate": 4.5498482603226295e-07, "loss": 1.9832, "num_input_tokens_seen": 410138696, "step": 402800 }, { "epoch": 7.9325077277470415, "grad_norm": 1.958203673362732, "learning_rate": 4.5477948667556885e-07, "loss": 2.0532, "num_input_tokens_seen": 410238312, "step": 402900 }, { "epoch": 7.934476580496545, "grad_norm": 1.9280420541763306, "learning_rate": 4.545741550082416e-07, "loss": 2.0072, "num_input_tokens_seen": 410340712, "step": 403000 }, { "epoch": 7.936445433246048, "grad_norm": 1.8636482954025269, "learning_rate": 4.543688310651962e-07, "loss": 1.9968, "num_input_tokens_seen": 410443112, "step": 403100 }, { "epoch": 7.938414285995551, "grad_norm": 2.1279783248901367, "learning_rate": 4.541635148813461e-07, "loss": 1.9661, "num_input_tokens_seen": 410541816, "step": 403200 }, { "epoch": 7.940383138745053, "grad_norm": 1.7693428993225098, "learning_rate": 4.539582064916038e-07, "loss": 1.9378, "num_input_tokens_seen": 410643456, "step": 403300 }, { "epoch": 7.942351991494556, "grad_norm": 1.7903947830200195, "learning_rate": 4.537529059308801e-07, "loss": 2.0237, "num_input_tokens_seen": 410744504, "step": 403400 }, { "epoch": 7.944320844244059, "grad_norm": 1.720933198928833, "learning_rate": 4.5354761323408476e-07, "loss": 1.9494, "num_input_tokens_seen": 410846904, "step": 403500 }, { "epoch": 7.946289696993562, "grad_norm": 1.999996304512024, "learning_rate": 4.5334232843612587e-07, "loss": 1.9988, "num_input_tokens_seen": 410949304, "step": 403600 }, { "epoch": 7.9482585497430644, "grad_norm": 2.0352823734283447, "learning_rate": 4.531370515719106e-07, "loss": 2.0084, "num_input_tokens_seen": 411051272, "step": 403700 }, { "epoch": 7.950227402492567, "grad_norm": 1.7085314989089966, "learning_rate": 4.5293178267634435e-07, "loss": 1.9817, "num_input_tokens_seen": 411152936, "step": 403800 }, { "epoch": 7.95219625524207, "grad_norm": 1.8613543510437012, "learning_rate": 4.5272652178433137e-07, "loss": 2.0533, "num_input_tokens_seen": 411252824, "step": 403900 }, { "epoch": 7.954165107991574, "grad_norm": 1.9997822046279907, "learning_rate": 4.5252126893077494e-07, "loss": 1.9447, "num_input_tokens_seen": 411354408, "step": 404000 }, { "epoch": 7.956133960741076, "grad_norm": 1.9824864864349365, "learning_rate": 4.52316024150576e-07, "loss": 1.9597, "num_input_tokens_seen": 411456808, "step": 404100 }, { "epoch": 7.958102813490579, "grad_norm": 1.8737788200378418, "learning_rate": 4.521107874786352e-07, "loss": 2.0305, "num_input_tokens_seen": 411557928, "step": 404200 }, { "epoch": 7.960071666240082, "grad_norm": 1.9790064096450806, "learning_rate": 4.5190555894985084e-07, "loss": 2.0123, "num_input_tokens_seen": 411660328, "step": 404300 }, { "epoch": 7.962040518989585, "grad_norm": 2.1659984588623047, "learning_rate": 4.517003385991208e-07, "loss": 1.9605, "num_input_tokens_seen": 411762184, "step": 404400 }, { "epoch": 7.964009371739087, "grad_norm": 1.9814157485961914, "learning_rate": 4.5149512646134045e-07, "loss": 2.0205, "num_input_tokens_seen": 411863848, "step": 404500 }, { "epoch": 7.96597822448859, "grad_norm": 1.9131046533584595, "learning_rate": 4.5128992257140514e-07, "loss": 2.0331, "num_input_tokens_seen": 411965808, "step": 404600 }, { "epoch": 7.967947077238094, "grad_norm": 1.846489667892456, "learning_rate": 4.510847269642073e-07, "loss": 2.029, "num_input_tokens_seen": 412068208, "step": 404700 }, { "epoch": 7.969915929987597, "grad_norm": 1.982517123222351, "learning_rate": 4.508795396746392e-07, "loss": 2.0108, "num_input_tokens_seen": 412169984, "step": 404800 }, { "epoch": 7.971884782737099, "grad_norm": 1.8219513893127441, "learning_rate": 4.5067436073759114e-07, "loss": 1.9365, "num_input_tokens_seen": 412272384, "step": 404900 }, { "epoch": 7.973853635486602, "grad_norm": 1.8535133600234985, "learning_rate": 4.504691901879519e-07, "loss": 1.999, "num_input_tokens_seen": 412374352, "step": 405000 }, { "epoch": 7.975822488236105, "grad_norm": 2.036672830581665, "learning_rate": 4.5026402806060924e-07, "loss": 1.9979, "num_input_tokens_seen": 412475384, "step": 405100 }, { "epoch": 7.977791340985608, "grad_norm": 2.0089473724365234, "learning_rate": 4.5005887439044904e-07, "loss": 2.0108, "num_input_tokens_seen": 412577784, "step": 405200 }, { "epoch": 7.97976019373511, "grad_norm": 3.8883466720581055, "learning_rate": 4.498537292123561e-07, "loss": 1.9855, "num_input_tokens_seen": 412678760, "step": 405300 }, { "epoch": 7.981729046484613, "grad_norm": 1.8637185096740723, "learning_rate": 4.4964859256121347e-07, "loss": 2.001, "num_input_tokens_seen": 412781160, "step": 405400 }, { "epoch": 7.983697899234116, "grad_norm": 3.9107072353363037, "learning_rate": 4.4944346447190307e-07, "loss": 1.9783, "num_input_tokens_seen": 412883560, "step": 405500 }, { "epoch": 7.9856667519836195, "grad_norm": 2.056428909301758, "learning_rate": 4.49238344979305e-07, "loss": 1.979, "num_input_tokens_seen": 412985912, "step": 405600 }, { "epoch": 7.987635604733122, "grad_norm": 1.8555965423583984, "learning_rate": 4.4903323411829834e-07, "loss": 1.9656, "num_input_tokens_seen": 413088264, "step": 405700 }, { "epoch": 7.989604457482625, "grad_norm": 1.9894907474517822, "learning_rate": 4.4882813192376024e-07, "loss": 1.9573, "num_input_tokens_seen": 413190080, "step": 405800 }, { "epoch": 7.991573310232128, "grad_norm": 1.6447981595993042, "learning_rate": 4.486230384305667e-07, "loss": 1.9908, "num_input_tokens_seen": 413291752, "step": 405900 }, { "epoch": 7.993542162981631, "grad_norm": 1.8621642589569092, "learning_rate": 4.4841795367359223e-07, "loss": 1.976, "num_input_tokens_seen": 413393568, "step": 406000 }, { "epoch": 7.995511015731133, "grad_norm": 1.8075709342956543, "learning_rate": 4.4821287768770967e-07, "loss": 1.9935, "num_input_tokens_seen": 413495424, "step": 406100 }, { "epoch": 7.997479868480636, "grad_norm": 1.8301723003387451, "learning_rate": 4.480078105077905e-07, "loss": 1.9843, "num_input_tokens_seen": 413597824, "step": 406200 }, { "epoch": 7.999448721230139, "grad_norm": 2.453486680984497, "learning_rate": 4.478027521687045e-07, "loss": 2.0093, "num_input_tokens_seen": 413700224, "step": 406300 }, { "epoch": 8.001417573979642, "grad_norm": 1.8768980503082275, "learning_rate": 4.475977027053204e-07, "loss": 2.0736, "num_input_tokens_seen": 413802624, "step": 406400 }, { "epoch": 8.003386426729145, "grad_norm": 2.593552350997925, "learning_rate": 4.473926621525048e-07, "loss": 1.9937, "num_input_tokens_seen": 413904352, "step": 406500 }, { "epoch": 8.005355279478648, "grad_norm": 1.6389186382293701, "learning_rate": 4.471876305451234e-07, "loss": 1.9593, "num_input_tokens_seen": 414006752, "step": 406600 }, { "epoch": 8.00732413222815, "grad_norm": 2.015887498855591, "learning_rate": 4.4698260791804e-07, "loss": 1.9714, "num_input_tokens_seen": 414108384, "step": 406700 }, { "epoch": 8.009292984977654, "grad_norm": 1.7495338916778564, "learning_rate": 4.4677759430611674e-07, "loss": 2.0084, "num_input_tokens_seen": 414210496, "step": 406800 }, { "epoch": 8.011261837727156, "grad_norm": 1.7535310983657837, "learning_rate": 4.46572589744215e-07, "loss": 1.9955, "num_input_tokens_seen": 414312808, "step": 406900 }, { "epoch": 8.013230690476659, "grad_norm": 2.2229084968566895, "learning_rate": 4.4636759426719347e-07, "loss": 1.954, "num_input_tokens_seen": 414414488, "step": 407000 }, { "epoch": 8.015199543226162, "grad_norm": 2.1934497356414795, "learning_rate": 4.461626079099104e-07, "loss": 1.9182, "num_input_tokens_seen": 414516888, "step": 407100 }, { "epoch": 8.017168395975665, "grad_norm": 1.7163622379302979, "learning_rate": 4.459576307072215e-07, "loss": 1.9731, "num_input_tokens_seen": 414617696, "step": 407200 }, { "epoch": 8.019137248725167, "grad_norm": 1.955451250076294, "learning_rate": 4.4575266269398203e-07, "loss": 1.9797, "num_input_tokens_seen": 414719648, "step": 407300 }, { "epoch": 8.02110610147467, "grad_norm": 2.0064711570739746, "learning_rate": 4.4554770390504443e-07, "loss": 1.971, "num_input_tokens_seen": 414821432, "step": 407400 }, { "epoch": 8.023074954224173, "grad_norm": 1.9356880187988281, "learning_rate": 4.453427543752608e-07, "loss": 2.0352, "num_input_tokens_seen": 414922488, "step": 407500 }, { "epoch": 8.025043806973676, "grad_norm": 1.8024967908859253, "learning_rate": 4.451378141394805e-07, "loss": 1.9791, "num_input_tokens_seen": 415024888, "step": 407600 }, { "epoch": 8.02701265972318, "grad_norm": 1.9446879625320435, "learning_rate": 4.449328832325524e-07, "loss": 1.9586, "num_input_tokens_seen": 415126456, "step": 407700 }, { "epoch": 8.028981512472683, "grad_norm": 1.8077514171600342, "learning_rate": 4.447279616893228e-07, "loss": 1.9698, "num_input_tokens_seen": 415228856, "step": 407800 }, { "epoch": 8.030950365222186, "grad_norm": 1.8537914752960205, "learning_rate": 4.445230495446372e-07, "loss": 2.0053, "num_input_tokens_seen": 415331256, "step": 407900 }, { "epoch": 8.032919217971688, "grad_norm": 2.114763021469116, "learning_rate": 4.443181468333392e-07, "loss": 2.045, "num_input_tokens_seen": 415431368, "step": 408000 }, { "epoch": 8.034888070721191, "grad_norm": 1.8701435327529907, "learning_rate": 4.441132535902707e-07, "loss": 1.9582, "num_input_tokens_seen": 415533312, "step": 408100 }, { "epoch": 8.036856923470694, "grad_norm": 2.1332123279571533, "learning_rate": 4.439083698502721e-07, "loss": 1.9816, "num_input_tokens_seen": 415635712, "step": 408200 }, { "epoch": 8.038825776220197, "grad_norm": 1.9398605823516846, "learning_rate": 4.4370349564818197e-07, "loss": 1.9609, "num_input_tokens_seen": 415736664, "step": 408300 }, { "epoch": 8.0407946289697, "grad_norm": 1.9261415004730225, "learning_rate": 4.4349863101883767e-07, "loss": 1.9643, "num_input_tokens_seen": 415839064, "step": 408400 }, { "epoch": 8.042763481719202, "grad_norm": 2.070559501647949, "learning_rate": 4.432937759970746e-07, "loss": 2.0263, "num_input_tokens_seen": 415940552, "step": 408500 }, { "epoch": 8.044732334468705, "grad_norm": 1.7003812789916992, "learning_rate": 4.430889306177267e-07, "loss": 1.9846, "num_input_tokens_seen": 416042952, "step": 408600 }, { "epoch": 8.046701187218208, "grad_norm": 1.6576703786849976, "learning_rate": 4.42884094915626e-07, "loss": 1.9659, "num_input_tokens_seen": 416143776, "step": 408700 }, { "epoch": 8.04867003996771, "grad_norm": 1.9099196195602417, "learning_rate": 4.4267926892560334e-07, "loss": 2.0357, "num_input_tokens_seen": 416244952, "step": 408800 }, { "epoch": 8.050638892717213, "grad_norm": 1.7863417863845825, "learning_rate": 4.424744526824874e-07, "loss": 1.9849, "num_input_tokens_seen": 416347352, "step": 408900 }, { "epoch": 8.052607745466716, "grad_norm": 1.963246464729309, "learning_rate": 4.422696462211056e-07, "loss": 2.0101, "num_input_tokens_seen": 416449752, "step": 409000 }, { "epoch": 8.054576598216219, "grad_norm": 1.973564624786377, "learning_rate": 4.4206484957628367e-07, "loss": 1.9676, "num_input_tokens_seen": 416551520, "step": 409100 }, { "epoch": 8.056545450965721, "grad_norm": 1.9066550731658936, "learning_rate": 4.4186006278284535e-07, "loss": 2.0223, "num_input_tokens_seen": 416652472, "step": 409200 }, { "epoch": 8.058514303715224, "grad_norm": 1.968870997428894, "learning_rate": 4.41655285875613e-07, "loss": 1.9441, "num_input_tokens_seen": 416754208, "step": 409300 }, { "epoch": 8.060483156464729, "grad_norm": 1.8548153638839722, "learning_rate": 4.4145051888940715e-07, "loss": 1.9885, "num_input_tokens_seen": 416856608, "step": 409400 }, { "epoch": 8.062452009214232, "grad_norm": 1.7558403015136719, "learning_rate": 4.412457618590468e-07, "loss": 1.9663, "num_input_tokens_seen": 416959008, "step": 409500 }, { "epoch": 8.064420861963734, "grad_norm": 2.3564465045928955, "learning_rate": 4.4104101481934896e-07, "loss": 2.0136, "num_input_tokens_seen": 417060840, "step": 409600 }, { "epoch": 8.066389714713237, "grad_norm": 1.9404773712158203, "learning_rate": 4.4083627780512945e-07, "loss": 1.9905, "num_input_tokens_seen": 417162384, "step": 409700 }, { "epoch": 8.06835856746274, "grad_norm": 1.9789031744003296, "learning_rate": 4.4063155085120173e-07, "loss": 1.9415, "num_input_tokens_seen": 417264280, "step": 409800 }, { "epoch": 8.070327420212243, "grad_norm": 2.336280345916748, "learning_rate": 4.4042683399237796e-07, "loss": 2.0524, "num_input_tokens_seen": 417363904, "step": 409900 }, { "epoch": 8.072296272961745, "grad_norm": 1.9985854625701904, "learning_rate": 4.402221272634689e-07, "loss": 1.9839, "num_input_tokens_seen": 417466304, "step": 410000 }, { "epoch": 8.074265125711248, "grad_norm": 1.961057186126709, "learning_rate": 4.4001743069928265e-07, "loss": 1.9513, "num_input_tokens_seen": 417567960, "step": 410100 }, { "epoch": 8.07623397846075, "grad_norm": 3.8790998458862305, "learning_rate": 4.398127443346267e-07, "loss": 2.0032, "num_input_tokens_seen": 417670360, "step": 410200 }, { "epoch": 8.078202831210254, "grad_norm": 1.961729884147644, "learning_rate": 4.3960806820430555e-07, "loss": 1.9924, "num_input_tokens_seen": 417772224, "step": 410300 }, { "epoch": 8.080171683959756, "grad_norm": 1.9560363292694092, "learning_rate": 4.394034023431233e-07, "loss": 2.0176, "num_input_tokens_seen": 417874624, "step": 410400 }, { "epoch": 8.08214053670926, "grad_norm": 1.913643479347229, "learning_rate": 4.3919874678588126e-07, "loss": 1.9908, "num_input_tokens_seen": 417976104, "step": 410500 }, { "epoch": 8.084109389458762, "grad_norm": 1.811996579170227, "learning_rate": 4.389941015673796e-07, "loss": 1.9808, "num_input_tokens_seen": 418077208, "step": 410600 }, { "epoch": 8.086078242208265, "grad_norm": 2.0247504711151123, "learning_rate": 4.3878946672241634e-07, "loss": 2.0238, "num_input_tokens_seen": 418179608, "step": 410700 }, { "epoch": 8.088047094957767, "grad_norm": 1.8828580379486084, "learning_rate": 4.385848422857881e-07, "loss": 1.9857, "num_input_tokens_seen": 418281152, "step": 410800 }, { "epoch": 8.09001594770727, "grad_norm": 2.2555432319641113, "learning_rate": 4.3838022829228933e-07, "loss": 2.0139, "num_input_tokens_seen": 418383552, "step": 410900 }, { "epoch": 8.091984800456775, "grad_norm": 1.7268421649932861, "learning_rate": 4.38175624776713e-07, "loss": 2.0021, "num_input_tokens_seen": 418485144, "step": 411000 }, { "epoch": 8.093953653206277, "grad_norm": 1.8772838115692139, "learning_rate": 4.379710317738503e-07, "loss": 1.9899, "num_input_tokens_seen": 418586160, "step": 411100 }, { "epoch": 8.09592250595578, "grad_norm": 1.8884308338165283, "learning_rate": 4.3776644931849044e-07, "loss": 1.9949, "num_input_tokens_seen": 418687712, "step": 411200 }, { "epoch": 8.097891358705283, "grad_norm": 1.8476943969726562, "learning_rate": 4.37561877445421e-07, "loss": 1.9742, "num_input_tokens_seen": 418790112, "step": 411300 }, { "epoch": 8.099860211454786, "grad_norm": 2.120741128921509, "learning_rate": 4.3735731618942764e-07, "loss": 1.9978, "num_input_tokens_seen": 418892064, "step": 411400 }, { "epoch": 8.101829064204288, "grad_norm": 1.7202106714248657, "learning_rate": 4.3715276558529436e-07, "loss": 1.9588, "num_input_tokens_seen": 418993728, "step": 411500 }, { "epoch": 8.103797916953791, "grad_norm": 1.9934120178222656, "learning_rate": 4.369482256678031e-07, "loss": 1.9947, "num_input_tokens_seen": 419094808, "step": 411600 }, { "epoch": 8.105766769703294, "grad_norm": 2.1794073581695557, "learning_rate": 4.3674369647173435e-07, "loss": 1.9949, "num_input_tokens_seen": 419196312, "step": 411700 }, { "epoch": 8.107735622452797, "grad_norm": 1.9189203977584839, "learning_rate": 4.3653917803186644e-07, "loss": 1.9819, "num_input_tokens_seen": 419297152, "step": 411800 }, { "epoch": 8.1097044752023, "grad_norm": 2.280561685562134, "learning_rate": 4.3633467038297604e-07, "loss": 1.992, "num_input_tokens_seen": 419399040, "step": 411900 }, { "epoch": 8.111673327951802, "grad_norm": 1.8303940296173096, "learning_rate": 4.3613017355983803e-07, "loss": 1.9362, "num_input_tokens_seen": 419500592, "step": 412000 }, { "epoch": 8.113642180701305, "grad_norm": 1.8243738412857056, "learning_rate": 4.3592568759722513e-07, "loss": 2.0041, "num_input_tokens_seen": 419602416, "step": 412100 }, { "epoch": 8.115611033450808, "grad_norm": 2.0194602012634277, "learning_rate": 4.35721212529909e-07, "loss": 1.9372, "num_input_tokens_seen": 419704816, "step": 412200 }, { "epoch": 8.11757988620031, "grad_norm": 1.8387428522109985, "learning_rate": 4.3551674839265814e-07, "loss": 1.9939, "num_input_tokens_seen": 419807216, "step": 412300 }, { "epoch": 8.119548738949813, "grad_norm": 1.7517679929733276, "learning_rate": 4.3531229522024065e-07, "loss": 1.9665, "num_input_tokens_seen": 419908792, "step": 412400 }, { "epoch": 8.121517591699316, "grad_norm": 1.9406459331512451, "learning_rate": 4.351078530474215e-07, "loss": 1.9767, "num_input_tokens_seen": 420010400, "step": 412500 }, { "epoch": 8.123486444448819, "grad_norm": 1.822887659072876, "learning_rate": 4.3490342190896485e-07, "loss": 1.955, "num_input_tokens_seen": 420112136, "step": 412600 }, { "epoch": 8.125455297198323, "grad_norm": 2.7356631755828857, "learning_rate": 4.346990018396319e-07, "loss": 1.9891, "num_input_tokens_seen": 420213672, "step": 412700 }, { "epoch": 8.127424149947826, "grad_norm": 1.9019370079040527, "learning_rate": 4.344945928741834e-07, "loss": 1.9666, "num_input_tokens_seen": 420314440, "step": 412800 }, { "epoch": 8.129393002697329, "grad_norm": 1.843346118927002, "learning_rate": 4.3429019504737643e-07, "loss": 1.9398, "num_input_tokens_seen": 420416840, "step": 412900 }, { "epoch": 8.131361855446832, "grad_norm": 1.9066869020462036, "learning_rate": 4.340858083939677e-07, "loss": 1.9923, "num_input_tokens_seen": 420518608, "step": 413000 }, { "epoch": 8.133330708196334, "grad_norm": 2.1369450092315674, "learning_rate": 4.3388143294871144e-07, "loss": 1.9991, "num_input_tokens_seen": 420619736, "step": 413100 }, { "epoch": 8.135299560945837, "grad_norm": 1.8550385236740112, "learning_rate": 4.3367706874635975e-07, "loss": 1.9679, "num_input_tokens_seen": 420722136, "step": 413200 }, { "epoch": 8.13726841369534, "grad_norm": 1.9507849216461182, "learning_rate": 4.334727158216632e-07, "loss": 1.9271, "num_input_tokens_seen": 420823080, "step": 413300 }, { "epoch": 8.139237266444843, "grad_norm": 2.0656447410583496, "learning_rate": 4.3326837420937007e-07, "loss": 2.0056, "num_input_tokens_seen": 420925480, "step": 413400 }, { "epoch": 8.141206119194345, "grad_norm": 2.0361788272857666, "learning_rate": 4.3306404394422716e-07, "loss": 1.9977, "num_input_tokens_seen": 421026864, "step": 413500 }, { "epoch": 8.143174971943848, "grad_norm": 1.8786455392837524, "learning_rate": 4.32859725060979e-07, "loss": 1.9456, "num_input_tokens_seen": 421127064, "step": 413600 }, { "epoch": 8.145143824693351, "grad_norm": 2.021986961364746, "learning_rate": 4.326554175943684e-07, "loss": 1.9837, "num_input_tokens_seen": 421228704, "step": 413700 }, { "epoch": 8.147112677442854, "grad_norm": 1.950090765953064, "learning_rate": 4.3245112157913583e-07, "loss": 1.9887, "num_input_tokens_seen": 421331104, "step": 413800 }, { "epoch": 8.149081530192356, "grad_norm": 1.8825485706329346, "learning_rate": 4.3224683705002046e-07, "loss": 1.9525, "num_input_tokens_seen": 421433128, "step": 413900 }, { "epoch": 8.15105038294186, "grad_norm": 1.9467074871063232, "learning_rate": 4.3204256404175886e-07, "loss": 2.0093, "num_input_tokens_seen": 421534480, "step": 414000 }, { "epoch": 8.153019235691362, "grad_norm": 1.6391477584838867, "learning_rate": 4.318383025890862e-07, "loss": 1.9754, "num_input_tokens_seen": 421636880, "step": 414100 }, { "epoch": 8.154988088440865, "grad_norm": 1.941521167755127, "learning_rate": 4.316340527267352e-07, "loss": 1.9348, "num_input_tokens_seen": 421739280, "step": 414200 }, { "epoch": 8.156956941190368, "grad_norm": 2.1100356578826904, "learning_rate": 4.3142981448943687e-07, "loss": 1.9707, "num_input_tokens_seen": 421841680, "step": 414300 }, { "epoch": 8.158925793939872, "grad_norm": 1.9965530633926392, "learning_rate": 4.312255879119204e-07, "loss": 1.9807, "num_input_tokens_seen": 421943600, "step": 414400 }, { "epoch": 8.160894646689375, "grad_norm": 1.5953651666641235, "learning_rate": 4.310213730289125e-07, "loss": 2.0153, "num_input_tokens_seen": 422045392, "step": 414500 }, { "epoch": 8.162863499438878, "grad_norm": 1.9872205257415771, "learning_rate": 4.308171698751384e-07, "loss": 1.9624, "num_input_tokens_seen": 422147040, "step": 414600 }, { "epoch": 8.16483235218838, "grad_norm": 2.0076491832733154, "learning_rate": 4.3061297848532094e-07, "loss": 1.9825, "num_input_tokens_seen": 422248016, "step": 414700 }, { "epoch": 8.166801204937883, "grad_norm": 1.7215479612350464, "learning_rate": 4.3040879889418143e-07, "loss": 1.9748, "num_input_tokens_seen": 422349664, "step": 414800 }, { "epoch": 8.168770057687386, "grad_norm": 2.028359889984131, "learning_rate": 4.302046311364385e-07, "loss": 2.0006, "num_input_tokens_seen": 422452064, "step": 414900 }, { "epoch": 8.170738910436889, "grad_norm": 2.7671148777008057, "learning_rate": 4.3000047524680935e-07, "loss": 1.9735, "num_input_tokens_seen": 422552784, "step": 415000 }, { "epoch": 8.172707763186391, "grad_norm": 1.802409291267395, "learning_rate": 4.2979633126000923e-07, "loss": 2.003, "num_input_tokens_seen": 422655184, "step": 415100 }, { "epoch": 8.174676615935894, "grad_norm": 2.103318452835083, "learning_rate": 4.295921992107506e-07, "loss": 2.0106, "num_input_tokens_seen": 422756176, "step": 415200 }, { "epoch": 8.176645468685397, "grad_norm": 2.1737442016601562, "learning_rate": 4.2938807913374494e-07, "loss": 1.9747, "num_input_tokens_seen": 422858040, "step": 415300 }, { "epoch": 8.1786143214349, "grad_norm": 11.40088176727295, "learning_rate": 4.2918397106370047e-07, "loss": 1.9635, "num_input_tokens_seen": 422959528, "step": 415400 }, { "epoch": 8.180583174184402, "grad_norm": 1.9554022550582886, "learning_rate": 4.2897987503532486e-07, "loss": 2.0448, "num_input_tokens_seen": 423061928, "step": 415500 }, { "epoch": 8.182552026933905, "grad_norm": 1.760983943939209, "learning_rate": 4.287757910833221e-07, "loss": 1.977, "num_input_tokens_seen": 423164328, "step": 415600 }, { "epoch": 8.184520879683408, "grad_norm": 2.022050619125366, "learning_rate": 4.2857171924239565e-07, "loss": 1.9657, "num_input_tokens_seen": 423265456, "step": 415700 }, { "epoch": 8.18648973243291, "grad_norm": 2.083146810531616, "learning_rate": 4.283676595472456e-07, "loss": 1.9946, "num_input_tokens_seen": 423367856, "step": 415800 }, { "epoch": 8.188458585182413, "grad_norm": 1.9051077365875244, "learning_rate": 4.281636120325712e-07, "loss": 2.0006, "num_input_tokens_seen": 423470256, "step": 415900 }, { "epoch": 8.190427437931916, "grad_norm": 2.152902126312256, "learning_rate": 4.2795957673306824e-07, "loss": 1.9659, "num_input_tokens_seen": 423571888, "step": 416000 }, { "epoch": 8.19239629068142, "grad_norm": 1.9484357833862305, "learning_rate": 4.2775555368343184e-07, "loss": 1.9503, "num_input_tokens_seen": 423673592, "step": 416100 }, { "epoch": 8.194365143430923, "grad_norm": 1.811044454574585, "learning_rate": 4.2755154291835436e-07, "loss": 1.9951, "num_input_tokens_seen": 423775992, "step": 416200 }, { "epoch": 8.196333996180426, "grad_norm": 2.6572468280792236, "learning_rate": 4.273475444725258e-07, "loss": 1.9828, "num_input_tokens_seen": 423877600, "step": 416300 }, { "epoch": 8.198302848929929, "grad_norm": 1.873489260673523, "learning_rate": 4.2714355838063457e-07, "loss": 2.0001, "num_input_tokens_seen": 423979040, "step": 416400 }, { "epoch": 8.200271701679432, "grad_norm": 1.9626555442810059, "learning_rate": 4.269395846773667e-07, "loss": 1.9072, "num_input_tokens_seen": 424081440, "step": 416500 }, { "epoch": 8.202240554428935, "grad_norm": 1.839101791381836, "learning_rate": 4.2673562339740634e-07, "loss": 1.9907, "num_input_tokens_seen": 424182984, "step": 416600 }, { "epoch": 8.204209407178437, "grad_norm": 21.5980167388916, "learning_rate": 4.2653167457543515e-07, "loss": 2.0106, "num_input_tokens_seen": 424284424, "step": 416700 }, { "epoch": 8.20617825992794, "grad_norm": 2.051440715789795, "learning_rate": 4.2632773824613314e-07, "loss": 2.037, "num_input_tokens_seen": 424386312, "step": 416800 }, { "epoch": 8.208147112677443, "grad_norm": 2.139739751815796, "learning_rate": 4.261238144441778e-07, "loss": 1.9819, "num_input_tokens_seen": 424487896, "step": 416900 }, { "epoch": 8.210115965426946, "grad_norm": 2.032843828201294, "learning_rate": 4.2591990320424464e-07, "loss": 1.9742, "num_input_tokens_seen": 424590296, "step": 417000 }, { "epoch": 8.212084818176448, "grad_norm": 2.0535480976104736, "learning_rate": 4.2571600456100727e-07, "loss": 2.0141, "num_input_tokens_seen": 424692696, "step": 417100 }, { "epoch": 8.214053670925951, "grad_norm": 1.7785221338272095, "learning_rate": 4.2551211854913666e-07, "loss": 1.9624, "num_input_tokens_seen": 424795096, "step": 417200 }, { "epoch": 8.216022523675454, "grad_norm": 1.574901819229126, "learning_rate": 4.253082452033022e-07, "loss": 1.9824, "num_input_tokens_seen": 424897496, "step": 417300 }, { "epoch": 8.217991376424957, "grad_norm": 2.0116829872131348, "learning_rate": 4.2510438455817045e-07, "loss": 2.01, "num_input_tokens_seen": 424998560, "step": 417400 }, { "epoch": 8.21996022917446, "grad_norm": 1.813637614250183, "learning_rate": 4.2490053664840657e-07, "loss": 2.0074, "num_input_tokens_seen": 425100960, "step": 417500 }, { "epoch": 8.221929081923962, "grad_norm": 1.6019140481948853, "learning_rate": 4.246967015086729e-07, "loss": 1.9644, "num_input_tokens_seen": 425202240, "step": 417600 }, { "epoch": 8.223897934673467, "grad_norm": 1.9128168821334839, "learning_rate": 4.2449287917363013e-07, "loss": 2.0033, "num_input_tokens_seen": 425303872, "step": 417700 }, { "epoch": 8.22586678742297, "grad_norm": 2.0122196674346924, "learning_rate": 4.2428906967793627e-07, "loss": 1.9778, "num_input_tokens_seen": 425406272, "step": 417800 }, { "epoch": 8.227835640172472, "grad_norm": 1.946831226348877, "learning_rate": 4.2408527305624763e-07, "loss": 1.9131, "num_input_tokens_seen": 425508672, "step": 417900 }, { "epoch": 8.229804492921975, "grad_norm": 2.8963088989257812, "learning_rate": 4.2388148934321797e-07, "loss": 1.9759, "num_input_tokens_seen": 425611072, "step": 418000 }, { "epoch": 8.231773345671478, "grad_norm": 2.354146718978882, "learning_rate": 4.236777185734989e-07, "loss": 1.9959, "num_input_tokens_seen": 425711696, "step": 418100 }, { "epoch": 8.23374219842098, "grad_norm": 2.2027013301849365, "learning_rate": 4.2347396078174035e-07, "loss": 1.981, "num_input_tokens_seen": 425813368, "step": 418200 }, { "epoch": 8.235711051170483, "grad_norm": 1.856339693069458, "learning_rate": 4.23270216002589e-07, "loss": 1.9627, "num_input_tokens_seen": 425915768, "step": 418300 }, { "epoch": 8.237679903919986, "grad_norm": 2.4227607250213623, "learning_rate": 4.230664842706906e-07, "loss": 1.949, "num_input_tokens_seen": 426018168, "step": 418400 }, { "epoch": 8.239648756669489, "grad_norm": 3.0098276138305664, "learning_rate": 4.228627656206873e-07, "loss": 1.9839, "num_input_tokens_seen": 426120472, "step": 418500 }, { "epoch": 8.241617609418991, "grad_norm": 3.137199640274048, "learning_rate": 4.226590600872204e-07, "loss": 1.9741, "num_input_tokens_seen": 426222872, "step": 418600 }, { "epoch": 8.243586462168494, "grad_norm": 2.072751045227051, "learning_rate": 4.224553677049277e-07, "loss": 2.0482, "num_input_tokens_seen": 426323976, "step": 418700 }, { "epoch": 8.245555314917997, "grad_norm": 2.0092689990997314, "learning_rate": 4.222516885084459e-07, "loss": 2.0172, "num_input_tokens_seen": 426425728, "step": 418800 }, { "epoch": 8.2475241676675, "grad_norm": 1.9350968599319458, "learning_rate": 4.2204802253240867e-07, "loss": 2.0268, "num_input_tokens_seen": 426528128, "step": 418900 }, { "epoch": 8.249493020417002, "grad_norm": 1.8077954053878784, "learning_rate": 4.218443698114477e-07, "loss": 1.9851, "num_input_tokens_seen": 426629912, "step": 419000 }, { "epoch": 8.251461873166505, "grad_norm": 1.7765130996704102, "learning_rate": 4.2164073038019255e-07, "loss": 1.9909, "num_input_tokens_seen": 426730600, "step": 419100 }, { "epoch": 8.253430725916008, "grad_norm": 1.8612784147262573, "learning_rate": 4.2143710427327016e-07, "loss": 1.9292, "num_input_tokens_seen": 426832208, "step": 419200 }, { "epoch": 8.25539957866551, "grad_norm": 1.9122129678726196, "learning_rate": 4.2123349152530567e-07, "loss": 2.0009, "num_input_tokens_seen": 426934008, "step": 419300 }, { "epoch": 8.257368431415015, "grad_norm": 1.730830192565918, "learning_rate": 4.2102989217092155e-07, "loss": 1.9699, "num_input_tokens_seen": 427036408, "step": 419400 }, { "epoch": 8.259337284164518, "grad_norm": 2.005462646484375, "learning_rate": 4.2082630624473824e-07, "loss": 1.9936, "num_input_tokens_seen": 427138808, "step": 419500 }, { "epoch": 8.26130613691402, "grad_norm": 1.696393370628357, "learning_rate": 4.206227337813737e-07, "loss": 1.9563, "num_input_tokens_seen": 427241208, "step": 419600 }, { "epoch": 8.263274989663524, "grad_norm": 1.833376407623291, "learning_rate": 4.204191748154439e-07, "loss": 1.9736, "num_input_tokens_seen": 427342240, "step": 419700 }, { "epoch": 8.265243842413026, "grad_norm": 2.1345181465148926, "learning_rate": 4.2021562938156204e-07, "loss": 1.9145, "num_input_tokens_seen": 427444640, "step": 419800 }, { "epoch": 8.267212695162529, "grad_norm": 2.0645036697387695, "learning_rate": 4.200120975143396e-07, "loss": 2.0103, "num_input_tokens_seen": 427546264, "step": 419900 }, { "epoch": 8.269181547912032, "grad_norm": 1.9077825546264648, "learning_rate": 4.198085792483852e-07, "loss": 2.0053, "num_input_tokens_seen": 427645224, "step": 420000 }, { "epoch": 8.271150400661535, "grad_norm": 1.7693476676940918, "learning_rate": 4.196050746183056e-07, "loss": 1.9874, "num_input_tokens_seen": 427747056, "step": 420100 }, { "epoch": 8.273119253411037, "grad_norm": 2.262269973754883, "learning_rate": 4.1940158365870505e-07, "loss": 2.0144, "num_input_tokens_seen": 427847912, "step": 420200 }, { "epoch": 8.27508810616054, "grad_norm": 2.2255208492279053, "learning_rate": 4.191981064041852e-07, "loss": 1.9802, "num_input_tokens_seen": 427948560, "step": 420300 }, { "epoch": 8.277056958910043, "grad_norm": 1.746009349822998, "learning_rate": 4.189946428893461e-07, "loss": 1.9744, "num_input_tokens_seen": 428050336, "step": 420400 }, { "epoch": 8.279025811659546, "grad_norm": 2.0518009662628174, "learning_rate": 4.1879119314878445e-07, "loss": 1.9827, "num_input_tokens_seen": 428151904, "step": 420500 }, { "epoch": 8.280994664409048, "grad_norm": 2.1446449756622314, "learning_rate": 4.185877572170958e-07, "loss": 2.0161, "num_input_tokens_seen": 428254304, "step": 420600 }, { "epoch": 8.282963517158551, "grad_norm": 2.2428677082061768, "learning_rate": 4.1838433512887206e-07, "loss": 2.0101, "num_input_tokens_seen": 428354696, "step": 420700 }, { "epoch": 8.284932369908054, "grad_norm": 1.8926867246627808, "learning_rate": 4.181809269187041e-07, "loss": 1.9907, "num_input_tokens_seen": 428457096, "step": 420800 }, { "epoch": 8.286901222657557, "grad_norm": 1.7421817779541016, "learning_rate": 4.1797753262117906e-07, "loss": 2.0325, "num_input_tokens_seen": 428559496, "step": 420900 }, { "epoch": 8.28887007540706, "grad_norm": 2.029924154281616, "learning_rate": 4.177741522708831e-07, "loss": 1.9524, "num_input_tokens_seen": 428661456, "step": 421000 }, { "epoch": 8.290838928156564, "grad_norm": 1.8868544101715088, "learning_rate": 4.1757078590239877e-07, "loss": 1.9485, "num_input_tokens_seen": 428763120, "step": 421100 }, { "epoch": 8.292807780906067, "grad_norm": 2.0351948738098145, "learning_rate": 4.173674335503071e-07, "loss": 1.9618, "num_input_tokens_seen": 428865056, "step": 421200 }, { "epoch": 8.29477663365557, "grad_norm": 2.06782865524292, "learning_rate": 4.171640952491865e-07, "loss": 1.959, "num_input_tokens_seen": 428967456, "step": 421300 }, { "epoch": 8.296745486405072, "grad_norm": 1.8675702810287476, "learning_rate": 4.169607710336127e-07, "loss": 2.0106, "num_input_tokens_seen": 429069320, "step": 421400 }, { "epoch": 8.298714339154575, "grad_norm": 2.083636522293091, "learning_rate": 4.167574609381596e-07, "loss": 2.0069, "num_input_tokens_seen": 429171720, "step": 421500 }, { "epoch": 8.300683191904078, "grad_norm": 2.0831527709960938, "learning_rate": 4.1655416499739805e-07, "loss": 1.9547, "num_input_tokens_seen": 429273488, "step": 421600 }, { "epoch": 8.30265204465358, "grad_norm": 1.8514652252197266, "learning_rate": 4.1635088324589697e-07, "loss": 1.9648, "num_input_tokens_seen": 429375128, "step": 421700 }, { "epoch": 8.304620897403083, "grad_norm": 2.076215982437134, "learning_rate": 4.161476157182226e-07, "loss": 2.0021, "num_input_tokens_seen": 429477528, "step": 421800 }, { "epoch": 8.306589750152586, "grad_norm": 1.8311114311218262, "learning_rate": 4.15944362448939e-07, "loss": 1.9753, "num_input_tokens_seen": 429579928, "step": 421900 }, { "epoch": 8.308558602902089, "grad_norm": 1.9638522863388062, "learning_rate": 4.157411234726075e-07, "loss": 2.026, "num_input_tokens_seen": 429682328, "step": 422000 }, { "epoch": 8.310527455651592, "grad_norm": 1.8673714399337769, "learning_rate": 4.155378988237873e-07, "loss": 1.9869, "num_input_tokens_seen": 429783424, "step": 422100 }, { "epoch": 8.312496308401094, "grad_norm": 1.8887158632278442, "learning_rate": 4.153346885370352e-07, "loss": 1.9707, "num_input_tokens_seen": 429885824, "step": 422200 }, { "epoch": 8.314465161150597, "grad_norm": 2.0292675495147705, "learning_rate": 4.15131492646905e-07, "loss": 1.9664, "num_input_tokens_seen": 429987752, "step": 422300 }, { "epoch": 8.3164340139001, "grad_norm": 1.9890892505645752, "learning_rate": 4.149283111879488e-07, "loss": 1.9944, "num_input_tokens_seen": 430089480, "step": 422400 }, { "epoch": 8.318402866649603, "grad_norm": 2.198776960372925, "learning_rate": 4.147251441947157e-07, "loss": 1.963, "num_input_tokens_seen": 430191200, "step": 422500 }, { "epoch": 8.320371719399105, "grad_norm": 1.9108864068984985, "learning_rate": 4.145219917017526e-07, "loss": 1.9883, "num_input_tokens_seen": 430293600, "step": 422600 }, { "epoch": 8.322340572148608, "grad_norm": 1.9309444427490234, "learning_rate": 4.1431885374360386e-07, "loss": 1.9865, "num_input_tokens_seen": 430393568, "step": 422700 }, { "epoch": 8.324309424898113, "grad_norm": 1.9458986520767212, "learning_rate": 4.141157303548114e-07, "loss": 1.9893, "num_input_tokens_seen": 430495968, "step": 422800 }, { "epoch": 8.326278277647615, "grad_norm": 2.189464807510376, "learning_rate": 4.1391262156991454e-07, "loss": 1.9625, "num_input_tokens_seen": 430598368, "step": 422900 }, { "epoch": 8.328247130397118, "grad_norm": 1.830135464668274, "learning_rate": 4.137095274234504e-07, "loss": 1.9965, "num_input_tokens_seen": 430699680, "step": 423000 }, { "epoch": 8.330215983146621, "grad_norm": 2.2409379482269287, "learning_rate": 4.1350644794995315e-07, "loss": 2.0005, "num_input_tokens_seen": 430801344, "step": 423100 }, { "epoch": 8.332184835896124, "grad_norm": 2.0005760192871094, "learning_rate": 4.133033831839549e-07, "loss": 2.0115, "num_input_tokens_seen": 430902008, "step": 423200 }, { "epoch": 8.334153688645626, "grad_norm": 2.054933786392212, "learning_rate": 4.131003331599853e-07, "loss": 1.9885, "num_input_tokens_seen": 431004408, "step": 423300 }, { "epoch": 8.33612254139513, "grad_norm": 1.8333909511566162, "learning_rate": 4.128972979125709e-07, "loss": 1.9816, "num_input_tokens_seen": 431105448, "step": 423400 }, { "epoch": 8.338091394144632, "grad_norm": 2.044424533843994, "learning_rate": 4.126942774762365e-07, "loss": 1.992, "num_input_tokens_seen": 431207848, "step": 423500 }, { "epoch": 8.340060246894135, "grad_norm": 2.124260187149048, "learning_rate": 4.124912718855037e-07, "loss": 1.9986, "num_input_tokens_seen": 431309032, "step": 423600 }, { "epoch": 8.342029099643637, "grad_norm": 2.029766082763672, "learning_rate": 4.1228828117489225e-07, "loss": 1.9755, "num_input_tokens_seen": 431410840, "step": 423700 }, { "epoch": 8.34399795239314, "grad_norm": 1.7196077108383179, "learning_rate": 4.120853053789185e-07, "loss": 1.9747, "num_input_tokens_seen": 431512200, "step": 423800 }, { "epoch": 8.345966805142643, "grad_norm": 1.8024121522903442, "learning_rate": 4.1188234453209735e-07, "loss": 1.9553, "num_input_tokens_seen": 431614600, "step": 423900 }, { "epoch": 8.347935657892146, "grad_norm": 1.8844993114471436, "learning_rate": 4.1167939866894e-07, "loss": 1.9603, "num_input_tokens_seen": 431716448, "step": 424000 }, { "epoch": 8.349904510641649, "grad_norm": 2.231053590774536, "learning_rate": 4.1147646782395614e-07, "loss": 1.9783, "num_input_tokens_seen": 431817920, "step": 424100 }, { "epoch": 8.351873363391151, "grad_norm": 2.028012990951538, "learning_rate": 4.112735520316523e-07, "loss": 1.9808, "num_input_tokens_seen": 431919736, "step": 424200 }, { "epoch": 8.353842216140654, "grad_norm": 1.9216039180755615, "learning_rate": 4.110706513265325e-07, "loss": 1.9357, "num_input_tokens_seen": 432020928, "step": 424300 }, { "epoch": 8.355811068890159, "grad_norm": 1.8875792026519775, "learning_rate": 4.108677657430986e-07, "loss": 1.9918, "num_input_tokens_seen": 432121480, "step": 424400 }, { "epoch": 8.357779921639661, "grad_norm": 2.23197340965271, "learning_rate": 4.106648953158491e-07, "loss": 1.9791, "num_input_tokens_seen": 432223008, "step": 424500 }, { "epoch": 8.359748774389164, "grad_norm": 1.9889016151428223, "learning_rate": 4.1046204007928083e-07, "loss": 1.9955, "num_input_tokens_seen": 432324496, "step": 424600 }, { "epoch": 8.361717627138667, "grad_norm": 1.8320786952972412, "learning_rate": 4.102592000678873e-07, "loss": 1.9879, "num_input_tokens_seen": 432426896, "step": 424700 }, { "epoch": 8.36368647988817, "grad_norm": 1.7026339769363403, "learning_rate": 4.1005637531616006e-07, "loss": 1.9809, "num_input_tokens_seen": 432529208, "step": 424800 }, { "epoch": 8.365655332637672, "grad_norm": 1.7036831378936768, "learning_rate": 4.098535658585874e-07, "loss": 1.9566, "num_input_tokens_seen": 432631608, "step": 424900 }, { "epoch": 8.367624185387175, "grad_norm": 1.6798295974731445, "learning_rate": 4.096507717296557e-07, "loss": 2.0003, "num_input_tokens_seen": 432733384, "step": 425000 }, { "epoch": 8.369593038136678, "grad_norm": 1.8298149108886719, "learning_rate": 4.0944799296384813e-07, "loss": 1.9813, "num_input_tokens_seen": 432835784, "step": 425100 }, { "epoch": 8.37156189088618, "grad_norm": 3.7572927474975586, "learning_rate": 4.092452295956456e-07, "loss": 2.0267, "num_input_tokens_seen": 432936896, "step": 425200 }, { "epoch": 8.373530743635683, "grad_norm": 1.8785380125045776, "learning_rate": 4.090424816595264e-07, "loss": 1.9489, "num_input_tokens_seen": 433038616, "step": 425300 }, { "epoch": 8.375499596385186, "grad_norm": 1.6943247318267822, "learning_rate": 4.0883974918996587e-07, "loss": 1.9718, "num_input_tokens_seen": 433141016, "step": 425400 }, { "epoch": 8.377468449134689, "grad_norm": 1.8722991943359375, "learning_rate": 4.0863703222143725e-07, "loss": 1.9749, "num_input_tokens_seen": 433242080, "step": 425500 }, { "epoch": 8.379437301884192, "grad_norm": 1.8121262788772583, "learning_rate": 4.084343307884106e-07, "loss": 1.9709, "num_input_tokens_seen": 433343856, "step": 425600 }, { "epoch": 8.381406154633694, "grad_norm": 1.9046111106872559, "learning_rate": 4.0823164492535383e-07, "loss": 2.0071, "num_input_tokens_seen": 433446256, "step": 425700 }, { "epoch": 8.383375007383197, "grad_norm": 1.6672890186309814, "learning_rate": 4.080289746667317e-07, "loss": 1.9643, "num_input_tokens_seen": 433548656, "step": 425800 }, { "epoch": 8.3853438601327, "grad_norm": 1.9436790943145752, "learning_rate": 4.0782632004700685e-07, "loss": 2.0125, "num_input_tokens_seen": 433649784, "step": 425900 }, { "epoch": 8.387312712882203, "grad_norm": 1.7911956310272217, "learning_rate": 4.0762368110063874e-07, "loss": 1.9535, "num_input_tokens_seen": 433751712, "step": 426000 }, { "epoch": 8.389281565631707, "grad_norm": 2.1295416355133057, "learning_rate": 4.0742105786208456e-07, "loss": 1.9249, "num_input_tokens_seen": 433854112, "step": 426100 }, { "epoch": 8.39125041838121, "grad_norm": 1.8713308572769165, "learning_rate": 4.072184503657986e-07, "loss": 1.9851, "num_input_tokens_seen": 433955736, "step": 426200 }, { "epoch": 8.393219271130713, "grad_norm": 1.8352092504501343, "learning_rate": 4.070158586462325e-07, "loss": 2.0063, "num_input_tokens_seen": 434057696, "step": 426300 }, { "epoch": 8.395188123880216, "grad_norm": 1.8988292217254639, "learning_rate": 4.0681328273783565e-07, "loss": 2.0578, "num_input_tokens_seen": 434159568, "step": 426400 }, { "epoch": 8.397156976629718, "grad_norm": 1.7122265100479126, "learning_rate": 4.066107226750538e-07, "loss": 1.9879, "num_input_tokens_seen": 434261768, "step": 426500 }, { "epoch": 8.399125829379221, "grad_norm": 1.8604533672332764, "learning_rate": 4.0640817849233124e-07, "loss": 2.0116, "num_input_tokens_seen": 434363464, "step": 426600 }, { "epoch": 8.401094682128724, "grad_norm": 1.8360460996627808, "learning_rate": 4.062056502241081e-07, "loss": 2.0201, "num_input_tokens_seen": 434465864, "step": 426700 }, { "epoch": 8.403063534878227, "grad_norm": 2.057734727859497, "learning_rate": 4.0600313790482347e-07, "loss": 2.0158, "num_input_tokens_seen": 434567000, "step": 426800 }, { "epoch": 8.40503238762773, "grad_norm": 3.415806770324707, "learning_rate": 4.058006415689121e-07, "loss": 2.0208, "num_input_tokens_seen": 434668168, "step": 426900 }, { "epoch": 8.407001240377232, "grad_norm": 23.310867309570312, "learning_rate": 4.0559816125080733e-07, "loss": 2.0415, "num_input_tokens_seen": 434769040, "step": 427000 }, { "epoch": 8.408970093126735, "grad_norm": 2.0601069927215576, "learning_rate": 4.053956969849389e-07, "loss": 1.9822, "num_input_tokens_seen": 434870896, "step": 427100 }, { "epoch": 8.410938945876238, "grad_norm": 1.8918333053588867, "learning_rate": 4.0519324880573423e-07, "loss": 1.9624, "num_input_tokens_seen": 434972520, "step": 427200 }, { "epoch": 8.41290779862574, "grad_norm": 2.754359722137451, "learning_rate": 4.049908167476181e-07, "loss": 1.9915, "num_input_tokens_seen": 435074920, "step": 427300 }, { "epoch": 8.414876651375243, "grad_norm": 2.1084115505218506, "learning_rate": 4.0478840084501207e-07, "loss": 1.9835, "num_input_tokens_seen": 435177320, "step": 427400 }, { "epoch": 8.416845504124746, "grad_norm": 1.9508016109466553, "learning_rate": 4.0458600113233566e-07, "loss": 1.9841, "num_input_tokens_seen": 435279088, "step": 427500 }, { "epoch": 8.418814356874249, "grad_norm": 1.9915024042129517, "learning_rate": 4.043836176440048e-07, "loss": 1.9733, "num_input_tokens_seen": 435381488, "step": 427600 }, { "epoch": 8.420783209623753, "grad_norm": 1.9296541213989258, "learning_rate": 4.041812504144334e-07, "loss": 2.0083, "num_input_tokens_seen": 435483144, "step": 427700 }, { "epoch": 8.422752062373256, "grad_norm": 2.1648142337799072, "learning_rate": 4.039788994780321e-07, "loss": 1.9969, "num_input_tokens_seen": 435584784, "step": 427800 }, { "epoch": 8.424720915122759, "grad_norm": 2.1455576419830322, "learning_rate": 4.037765648692092e-07, "loss": 1.9947, "num_input_tokens_seen": 435685320, "step": 427900 }, { "epoch": 8.426689767872261, "grad_norm": 1.8843876123428345, "learning_rate": 4.0357424662236974e-07, "loss": 2.0373, "num_input_tokens_seen": 435785712, "step": 428000 }, { "epoch": 8.428658620621764, "grad_norm": 1.9977744817733765, "learning_rate": 4.0337194477191643e-07, "loss": 2.0173, "num_input_tokens_seen": 435886168, "step": 428100 }, { "epoch": 8.430627473371267, "grad_norm": 1.967344045639038, "learning_rate": 4.031696593522488e-07, "loss": 1.9716, "num_input_tokens_seen": 435988568, "step": 428200 }, { "epoch": 8.43259632612077, "grad_norm": 1.6700453758239746, "learning_rate": 4.0296739039776394e-07, "loss": 1.9326, "num_input_tokens_seen": 436090968, "step": 428300 }, { "epoch": 8.434565178870272, "grad_norm": 1.8812015056610107, "learning_rate": 4.02765137942856e-07, "loss": 1.9479, "num_input_tokens_seen": 436191576, "step": 428400 }, { "epoch": 8.436534031619775, "grad_norm": 1.8461774587631226, "learning_rate": 4.0256290202191615e-07, "loss": 2.0043, "num_input_tokens_seen": 436293976, "step": 428500 }, { "epoch": 8.438502884369278, "grad_norm": 2.5795412063598633, "learning_rate": 4.023606826693331e-07, "loss": 1.9447, "num_input_tokens_seen": 436396376, "step": 428600 }, { "epoch": 8.44047173711878, "grad_norm": 2.1136581897735596, "learning_rate": 4.0215847991949226e-07, "loss": 1.969, "num_input_tokens_seen": 436498776, "step": 428700 }, { "epoch": 8.442440589868283, "grad_norm": 2.1589603424072266, "learning_rate": 4.0195629380677695e-07, "loss": 1.9825, "num_input_tokens_seen": 436599592, "step": 428800 }, { "epoch": 8.444409442617786, "grad_norm": 1.8121565580368042, "learning_rate": 4.017541243655666e-07, "loss": 2.0007, "num_input_tokens_seen": 436701992, "step": 428900 }, { "epoch": 8.446378295367289, "grad_norm": 2.016279935836792, "learning_rate": 4.015519716302391e-07, "loss": 1.9881, "num_input_tokens_seen": 436804392, "step": 429000 }, { "epoch": 8.448347148116792, "grad_norm": 2.1842615604400635, "learning_rate": 4.013498356351681e-07, "loss": 1.9946, "num_input_tokens_seen": 436905456, "step": 429100 }, { "epoch": 8.450316000866295, "grad_norm": 1.8275010585784912, "learning_rate": 4.011477164147257e-07, "loss": 1.9784, "num_input_tokens_seen": 437006192, "step": 429200 }, { "epoch": 8.452284853615797, "grad_norm": 1.9177550077438354, "learning_rate": 4.009456140032804e-07, "loss": 1.9949, "num_input_tokens_seen": 437108592, "step": 429300 }, { "epoch": 8.4542537063653, "grad_norm": 2.019090414047241, "learning_rate": 4.0074352843519787e-07, "loss": 2.0295, "num_input_tokens_seen": 437208776, "step": 429400 }, { "epoch": 8.456222559114805, "grad_norm": 1.9815258979797363, "learning_rate": 4.0054145974484134e-07, "loss": 1.9856, "num_input_tokens_seen": 437309936, "step": 429500 }, { "epoch": 8.458191411864307, "grad_norm": 1.8791011571884155, "learning_rate": 4.003394079665705e-07, "loss": 1.9485, "num_input_tokens_seen": 437412104, "step": 429600 }, { "epoch": 8.46016026461381, "grad_norm": 1.783757209777832, "learning_rate": 4.0013737313474297e-07, "loss": 2.0115, "num_input_tokens_seen": 437513832, "step": 429700 }, { "epoch": 8.462129117363313, "grad_norm": 1.8619985580444336, "learning_rate": 3.999353552837127e-07, "loss": 1.9659, "num_input_tokens_seen": 437615416, "step": 429800 }, { "epoch": 8.464097970112816, "grad_norm": 1.8177839517593384, "learning_rate": 3.997333544478314e-07, "loss": 1.9783, "num_input_tokens_seen": 437717816, "step": 429900 }, { "epoch": 8.466066822862318, "grad_norm": 2.1941258907318115, "learning_rate": 3.9953137066144736e-07, "loss": 1.9966, "num_input_tokens_seen": 437820216, "step": 430000 }, { "epoch": 8.468035675611821, "grad_norm": 1.86483895778656, "learning_rate": 3.993294039589065e-07, "loss": 1.9953, "num_input_tokens_seen": 437921048, "step": 430100 }, { "epoch": 8.470004528361324, "grad_norm": 1.8158870935440063, "learning_rate": 3.991274543745512e-07, "loss": 1.9526, "num_input_tokens_seen": 438023448, "step": 430200 }, { "epoch": 8.471973381110827, "grad_norm": 1.7289897203445435, "learning_rate": 3.9892552194272156e-07, "loss": 1.9768, "num_input_tokens_seen": 438125848, "step": 430300 }, { "epoch": 8.47394223386033, "grad_norm": 1.8524589538574219, "learning_rate": 3.987236066977545e-07, "loss": 2.0123, "num_input_tokens_seen": 438227656, "step": 430400 }, { "epoch": 8.475911086609832, "grad_norm": 1.8160289525985718, "learning_rate": 3.9852170867398385e-07, "loss": 2.0069, "num_input_tokens_seen": 438329464, "step": 430500 }, { "epoch": 8.477879939359335, "grad_norm": 1.9283607006072998, "learning_rate": 3.9831982790574074e-07, "loss": 1.9812, "num_input_tokens_seen": 438430968, "step": 430600 }, { "epoch": 8.479848792108838, "grad_norm": 6.773560047149658, "learning_rate": 3.9811796442735326e-07, "loss": 1.9583, "num_input_tokens_seen": 438533368, "step": 430700 }, { "epoch": 8.48181764485834, "grad_norm": 1.9884876012802124, "learning_rate": 3.979161182731467e-07, "loss": 2.0006, "num_input_tokens_seen": 438635328, "step": 430800 }, { "epoch": 8.483786497607843, "grad_norm": 2.183328628540039, "learning_rate": 3.977142894774431e-07, "loss": 2.0123, "num_input_tokens_seen": 438737728, "step": 430900 }, { "epoch": 8.485755350357346, "grad_norm": 1.9174261093139648, "learning_rate": 3.97512478074562e-07, "loss": 1.9989, "num_input_tokens_seen": 438840128, "step": 431000 }, { "epoch": 8.48772420310685, "grad_norm": 2.13865065574646, "learning_rate": 3.973106840988194e-07, "loss": 1.9562, "num_input_tokens_seen": 438942184, "step": 431100 }, { "epoch": 8.489693055856353, "grad_norm": 1.809452772140503, "learning_rate": 3.97108907584529e-07, "loss": 1.9784, "num_input_tokens_seen": 439044584, "step": 431200 }, { "epoch": 8.491661908605856, "grad_norm": 2.4269907474517822, "learning_rate": 3.969071485660009e-07, "loss": 1.9849, "num_input_tokens_seen": 439146984, "step": 431300 }, { "epoch": 8.493630761355359, "grad_norm": 1.9628105163574219, "learning_rate": 3.967054070775426e-07, "loss": 2.0059, "num_input_tokens_seen": 439249384, "step": 431400 }, { "epoch": 8.495599614104862, "grad_norm": 2.2871105670928955, "learning_rate": 3.965036831534589e-07, "loss": 1.9808, "num_input_tokens_seen": 439351784, "step": 431500 }, { "epoch": 8.497568466854364, "grad_norm": 1.9083971977233887, "learning_rate": 3.963019768280507e-07, "loss": 1.9865, "num_input_tokens_seen": 439454184, "step": 431600 }, { "epoch": 8.499537319603867, "grad_norm": 2.2168989181518555, "learning_rate": 3.9610028813561696e-07, "loss": 1.9583, "num_input_tokens_seen": 439556088, "step": 431700 }, { "epoch": 8.50150617235337, "grad_norm": 2.5334222316741943, "learning_rate": 3.9589861711045257e-07, "loss": 1.9762, "num_input_tokens_seen": 439657728, "step": 431800 }, { "epoch": 8.503475025102873, "grad_norm": 1.8706711530685425, "learning_rate": 3.956969637868507e-07, "loss": 1.9874, "num_input_tokens_seen": 439760128, "step": 431900 }, { "epoch": 8.505443877852375, "grad_norm": 2.2788848876953125, "learning_rate": 3.954953281991e-07, "loss": 1.9648, "num_input_tokens_seen": 439862528, "step": 432000 }, { "epoch": 8.507412730601878, "grad_norm": 2.0718917846679688, "learning_rate": 3.9529371038148765e-07, "loss": 1.9935, "num_input_tokens_seen": 439964928, "step": 432100 }, { "epoch": 8.50938158335138, "grad_norm": 2.064072847366333, "learning_rate": 3.9509211036829644e-07, "loss": 1.9885, "num_input_tokens_seen": 440067328, "step": 432200 }, { "epoch": 8.511350436100884, "grad_norm": 1.813452124595642, "learning_rate": 3.948905281938071e-07, "loss": 1.9709, "num_input_tokens_seen": 440168432, "step": 432300 }, { "epoch": 8.513319288850386, "grad_norm": 1.9435439109802246, "learning_rate": 3.94688963892297e-07, "loss": 2.0214, "num_input_tokens_seen": 440270032, "step": 432400 }, { "epoch": 8.51528814159989, "grad_norm": 1.7269937992095947, "learning_rate": 3.9448741749804027e-07, "loss": 1.9927, "num_input_tokens_seen": 440371184, "step": 432500 }, { "epoch": 8.517256994349392, "grad_norm": 1.9344353675842285, "learning_rate": 3.9428588904530836e-07, "loss": 2.0034, "num_input_tokens_seen": 440473584, "step": 432600 }, { "epoch": 8.519225847098895, "grad_norm": 2.3893494606018066, "learning_rate": 3.940843785683692e-07, "loss": 1.9497, "num_input_tokens_seen": 440575384, "step": 432700 }, { "epoch": 8.5211946998484, "grad_norm": 2.681697130203247, "learning_rate": 3.9388288610148833e-07, "loss": 1.9514, "num_input_tokens_seen": 440677712, "step": 432800 }, { "epoch": 8.523163552597902, "grad_norm": 1.8612326383590698, "learning_rate": 3.9368141167892756e-07, "loss": 1.9723, "num_input_tokens_seen": 440779680, "step": 432900 }, { "epoch": 8.525132405347405, "grad_norm": 1.8434288501739502, "learning_rate": 3.9347995533494605e-07, "loss": 1.9724, "num_input_tokens_seen": 440880920, "step": 433000 }, { "epoch": 8.527101258096907, "grad_norm": 2.232220411300659, "learning_rate": 3.932785171037996e-07, "loss": 1.9915, "num_input_tokens_seen": 440983320, "step": 433100 }, { "epoch": 8.52907011084641, "grad_norm": 2.323387384414673, "learning_rate": 3.9307709701974135e-07, "loss": 1.9952, "num_input_tokens_seen": 441082520, "step": 433200 }, { "epoch": 8.531038963595913, "grad_norm": 1.8952317237854004, "learning_rate": 3.928756951170208e-07, "loss": 1.9617, "num_input_tokens_seen": 441184920, "step": 433300 }, { "epoch": 8.533007816345416, "grad_norm": 1.9474390745162964, "learning_rate": 3.9267431142988473e-07, "loss": 1.9576, "num_input_tokens_seen": 441286760, "step": 433400 }, { "epoch": 8.534976669094918, "grad_norm": 2.042001485824585, "learning_rate": 3.9247294599257683e-07, "loss": 1.996, "num_input_tokens_seen": 441389160, "step": 433500 }, { "epoch": 8.536945521844421, "grad_norm": 1.6450037956237793, "learning_rate": 3.922715988393375e-07, "loss": 1.9503, "num_input_tokens_seen": 441491560, "step": 433600 }, { "epoch": 8.538914374593924, "grad_norm": 1.9197132587432861, "learning_rate": 3.9207027000440427e-07, "loss": 1.9922, "num_input_tokens_seen": 441593128, "step": 433700 }, { "epoch": 8.540883227343427, "grad_norm": 2.1575756072998047, "learning_rate": 3.918689595220112e-07, "loss": 1.9514, "num_input_tokens_seen": 441694936, "step": 433800 }, { "epoch": 8.54285208009293, "grad_norm": 2.118551015853882, "learning_rate": 3.916676674263897e-07, "loss": 1.9915, "num_input_tokens_seen": 441797336, "step": 433900 }, { "epoch": 8.544820932842432, "grad_norm": 2.099083185195923, "learning_rate": 3.914663937517674e-07, "loss": 2.0036, "num_input_tokens_seen": 441899256, "step": 434000 }, { "epoch": 8.546789785591935, "grad_norm": 1.927480697631836, "learning_rate": 3.9126513853236966e-07, "loss": 1.9632, "num_input_tokens_seen": 442001088, "step": 434100 }, { "epoch": 8.548758638341438, "grad_norm": 2.0223116874694824, "learning_rate": 3.9106390180241784e-07, "loss": 1.9731, "num_input_tokens_seen": 442102624, "step": 434200 }, { "epoch": 8.55072749109094, "grad_norm": 2.0387868881225586, "learning_rate": 3.908626835961306e-07, "loss": 2.0017, "num_input_tokens_seen": 442204096, "step": 434300 }, { "epoch": 8.552696343840445, "grad_norm": 1.9302129745483398, "learning_rate": 3.906614839477238e-07, "loss": 1.9831, "num_input_tokens_seen": 442304768, "step": 434400 }, { "epoch": 8.554665196589948, "grad_norm": 2.205199718475342, "learning_rate": 3.9046030289140916e-07, "loss": 2.0015, "num_input_tokens_seen": 442407168, "step": 434500 }, { "epoch": 8.55663404933945, "grad_norm": 1.7624105215072632, "learning_rate": 3.902591404613965e-07, "loss": 2.0227, "num_input_tokens_seen": 442507768, "step": 434600 }, { "epoch": 8.558602902088953, "grad_norm": 6.508541584014893, "learning_rate": 3.90057996691891e-07, "loss": 2.0247, "num_input_tokens_seen": 442608272, "step": 434700 }, { "epoch": 8.560571754838456, "grad_norm": 1.8274939060211182, "learning_rate": 3.898568716170962e-07, "loss": 1.963, "num_input_tokens_seen": 442710216, "step": 434800 }, { "epoch": 8.562540607587959, "grad_norm": 2.1489410400390625, "learning_rate": 3.8965576527121106e-07, "loss": 1.9748, "num_input_tokens_seen": 442811976, "step": 434900 }, { "epoch": 8.564509460337462, "grad_norm": 2.253645896911621, "learning_rate": 3.894546776884327e-07, "loss": 2.0141, "num_input_tokens_seen": 442913288, "step": 435000 }, { "epoch": 8.566478313086964, "grad_norm": 1.892430067062378, "learning_rate": 3.892536089029537e-07, "loss": 1.9889, "num_input_tokens_seen": 443015688, "step": 435100 }, { "epoch": 8.568447165836467, "grad_norm": 1.7716001272201538, "learning_rate": 3.8905255894896475e-07, "loss": 1.9583, "num_input_tokens_seen": 443118088, "step": 435200 }, { "epoch": 8.57041601858597, "grad_norm": 2.080090045928955, "learning_rate": 3.8885152786065204e-07, "loss": 2.046, "num_input_tokens_seen": 443219720, "step": 435300 }, { "epoch": 8.572384871335473, "grad_norm": 2.0503625869750977, "learning_rate": 3.886505156721996e-07, "loss": 1.9642, "num_input_tokens_seen": 443322120, "step": 435400 }, { "epoch": 8.574353724084975, "grad_norm": 1.9515348672866821, "learning_rate": 3.8844952241778807e-07, "loss": 1.9432, "num_input_tokens_seen": 443424432, "step": 435500 }, { "epoch": 8.576322576834478, "grad_norm": 1.940108060836792, "learning_rate": 3.882485481315941e-07, "loss": 1.9411, "num_input_tokens_seen": 443526096, "step": 435600 }, { "epoch": 8.578291429583981, "grad_norm": 1.890031337738037, "learning_rate": 3.880475928477922e-07, "loss": 1.964, "num_input_tokens_seen": 443628200, "step": 435700 }, { "epoch": 8.580260282333484, "grad_norm": 1.9414799213409424, "learning_rate": 3.878466566005528e-07, "loss": 1.9801, "num_input_tokens_seen": 443730600, "step": 435800 }, { "epoch": 8.582229135082986, "grad_norm": 1.7501819133758545, "learning_rate": 3.876457394240435e-07, "loss": 1.9669, "num_input_tokens_seen": 443832128, "step": 435900 }, { "epoch": 8.58419798783249, "grad_norm": 1.8722598552703857, "learning_rate": 3.874448413524285e-07, "loss": 1.978, "num_input_tokens_seen": 443932944, "step": 436000 }, { "epoch": 8.586166840581992, "grad_norm": 1.9408739805221558, "learning_rate": 3.87243962419869e-07, "loss": 2.0172, "num_input_tokens_seen": 444035344, "step": 436100 }, { "epoch": 8.588135693331497, "grad_norm": 1.9807482957839966, "learning_rate": 3.8704310266052257e-07, "loss": 1.9877, "num_input_tokens_seen": 444137744, "step": 436200 }, { "epoch": 8.590104546081, "grad_norm": 1.986048698425293, "learning_rate": 3.868422621085439e-07, "loss": 1.9757, "num_input_tokens_seen": 444240144, "step": 436300 }, { "epoch": 8.592073398830502, "grad_norm": 3.6090807914733887, "learning_rate": 3.866414407980841e-07, "loss": 1.9651, "num_input_tokens_seen": 444342008, "step": 436400 }, { "epoch": 8.594042251580005, "grad_norm": 1.8517863750457764, "learning_rate": 3.8644063876329114e-07, "loss": 1.9837, "num_input_tokens_seen": 444442616, "step": 436500 }, { "epoch": 8.596011104329508, "grad_norm": 1.7607098817825317, "learning_rate": 3.862398560383098e-07, "loss": 1.9728, "num_input_tokens_seen": 444543928, "step": 436600 }, { "epoch": 8.59797995707901, "grad_norm": 2.03020977973938, "learning_rate": 3.860390926572814e-07, "loss": 1.9836, "num_input_tokens_seen": 444646328, "step": 436700 }, { "epoch": 8.599948809828513, "grad_norm": 1.9390146732330322, "learning_rate": 3.858383486543442e-07, "loss": 1.9898, "num_input_tokens_seen": 444748176, "step": 436800 }, { "epoch": 8.601917662578016, "grad_norm": 1.862971544265747, "learning_rate": 3.8563762406363284e-07, "loss": 1.9241, "num_input_tokens_seen": 444849872, "step": 436900 }, { "epoch": 8.603886515327519, "grad_norm": 2.3055028915405273, "learning_rate": 3.8543691891927896e-07, "loss": 2.0357, "num_input_tokens_seen": 444950808, "step": 437000 }, { "epoch": 8.605855368077021, "grad_norm": 2.1133251190185547, "learning_rate": 3.852362332554107e-07, "loss": 2.007, "num_input_tokens_seen": 445053208, "step": 437100 }, { "epoch": 8.607824220826524, "grad_norm": 2.1245574951171875, "learning_rate": 3.8503556710615316e-07, "loss": 1.9548, "num_input_tokens_seen": 445155040, "step": 437200 }, { "epoch": 8.609793073576027, "grad_norm": 1.8872050046920776, "learning_rate": 3.8483492050562747e-07, "loss": 1.9922, "num_input_tokens_seen": 445256480, "step": 437300 }, { "epoch": 8.61176192632553, "grad_norm": 1.849798321723938, "learning_rate": 3.846342934879524e-07, "loss": 1.9615, "num_input_tokens_seen": 445358880, "step": 437400 }, { "epoch": 8.613730779075032, "grad_norm": 2.1888458728790283, "learning_rate": 3.8443368608724274e-07, "loss": 2.022, "num_input_tokens_seen": 445460488, "step": 437500 }, { "epoch": 8.615699631824535, "grad_norm": 1.822311282157898, "learning_rate": 3.8423309833761e-07, "loss": 1.9643, "num_input_tokens_seen": 445562888, "step": 437600 }, { "epoch": 8.617668484574038, "grad_norm": 1.8156375885009766, "learning_rate": 3.8403253027316246e-07, "loss": 1.9618, "num_input_tokens_seen": 445664008, "step": 437700 }, { "epoch": 8.619637337323542, "grad_norm": 1.8686310052871704, "learning_rate": 3.8383198192800496e-07, "loss": 2.0324, "num_input_tokens_seen": 445766408, "step": 437800 }, { "epoch": 8.621606190073045, "grad_norm": 2.314169406890869, "learning_rate": 3.8363145333623924e-07, "loss": 1.9561, "num_input_tokens_seen": 445868808, "step": 437900 }, { "epoch": 8.623575042822548, "grad_norm": 2.126976490020752, "learning_rate": 3.834309445319633e-07, "loss": 1.9656, "num_input_tokens_seen": 445971208, "step": 438000 }, { "epoch": 8.62554389557205, "grad_norm": 4.256861209869385, "learning_rate": 3.8323045554927215e-07, "loss": 1.9902, "num_input_tokens_seen": 446072184, "step": 438100 }, { "epoch": 8.627512748321553, "grad_norm": 2.1243627071380615, "learning_rate": 3.8302998642225705e-07, "loss": 1.982, "num_input_tokens_seen": 446174584, "step": 438200 }, { "epoch": 8.629481601071056, "grad_norm": 4.225976467132568, "learning_rate": 3.828295371850062e-07, "loss": 1.9624, "num_input_tokens_seen": 446276984, "step": 438300 }, { "epoch": 8.631450453820559, "grad_norm": 2.3296940326690674, "learning_rate": 3.826291078716043e-07, "loss": 1.9954, "num_input_tokens_seen": 446378328, "step": 438400 }, { "epoch": 8.633419306570062, "grad_norm": 3.5202696323394775, "learning_rate": 3.8242869851613265e-07, "loss": 1.9833, "num_input_tokens_seen": 446479984, "step": 438500 }, { "epoch": 8.635388159319564, "grad_norm": 1.9449868202209473, "learning_rate": 3.8222830915266923e-07, "loss": 1.9779, "num_input_tokens_seen": 446582384, "step": 438600 }, { "epoch": 8.637357012069067, "grad_norm": 1.9325412511825562, "learning_rate": 3.820279398152884e-07, "loss": 2.0006, "num_input_tokens_seen": 446684784, "step": 438700 }, { "epoch": 8.63932586481857, "grad_norm": 2.032569169998169, "learning_rate": 3.818275905380615e-07, "loss": 2.0278, "num_input_tokens_seen": 446787184, "step": 438800 }, { "epoch": 8.641294717568073, "grad_norm": 2.101090908050537, "learning_rate": 3.8162726135505607e-07, "loss": 1.9513, "num_input_tokens_seen": 446889584, "step": 438900 }, { "epoch": 8.643263570317576, "grad_norm": 2.1767234802246094, "learning_rate": 3.814269523003365e-07, "loss": 1.938, "num_input_tokens_seen": 446991984, "step": 439000 }, { "epoch": 8.645232423067078, "grad_norm": 1.8542765378952026, "learning_rate": 3.812266634079635e-07, "loss": 1.9841, "num_input_tokens_seen": 447094384, "step": 439100 }, { "epoch": 8.647201275816581, "grad_norm": 1.9887007474899292, "learning_rate": 3.8102639471199473e-07, "loss": 1.9702, "num_input_tokens_seen": 447196160, "step": 439200 }, { "epoch": 8.649170128566084, "grad_norm": 1.7489919662475586, "learning_rate": 3.808261462464839e-07, "loss": 2.1175, "num_input_tokens_seen": 447297608, "step": 439300 }, { "epoch": 8.651138981315587, "grad_norm": 1.914811372756958, "learning_rate": 3.8062591804548165e-07, "loss": 1.9412, "num_input_tokens_seen": 447400008, "step": 439400 }, { "epoch": 8.653107834065091, "grad_norm": 1.98484206199646, "learning_rate": 3.804257101430355e-07, "loss": 2.0423, "num_input_tokens_seen": 447502024, "step": 439500 }, { "epoch": 8.655076686814594, "grad_norm": 6.037850379943848, "learning_rate": 3.802255225731886e-07, "loss": 2.0018, "num_input_tokens_seen": 447604424, "step": 439600 }, { "epoch": 8.657045539564097, "grad_norm": 2.195216178894043, "learning_rate": 3.800253553699816e-07, "loss": 1.998, "num_input_tokens_seen": 447706824, "step": 439700 }, { "epoch": 8.6590143923136, "grad_norm": 2.026824951171875, "learning_rate": 3.7982520856745063e-07, "loss": 2.011, "num_input_tokens_seen": 447808600, "step": 439800 }, { "epoch": 8.660983245063102, "grad_norm": 1.842523217201233, "learning_rate": 3.7962508219962975e-07, "loss": 1.9868, "num_input_tokens_seen": 447910184, "step": 439900 }, { "epoch": 8.662952097812605, "grad_norm": 2.0842556953430176, "learning_rate": 3.7942497630054803e-07, "loss": 2.0129, "num_input_tokens_seen": 448012584, "step": 440000 }, { "epoch": 8.664920950562108, "grad_norm": 1.891471028327942, "learning_rate": 3.792248909042324e-07, "loss": 2.0276, "num_input_tokens_seen": 448114984, "step": 440100 }, { "epoch": 8.66688980331161, "grad_norm": 1.9226154088974, "learning_rate": 3.790248260447051e-07, "loss": 2.033, "num_input_tokens_seen": 448216312, "step": 440200 }, { "epoch": 8.668858656061113, "grad_norm": 1.829014778137207, "learning_rate": 3.788247817559861e-07, "loss": 1.9778, "num_input_tokens_seen": 448317328, "step": 440300 }, { "epoch": 8.670827508810616, "grad_norm": 1.9798763990402222, "learning_rate": 3.786247580720906e-07, "loss": 1.9561, "num_input_tokens_seen": 448418960, "step": 440400 }, { "epoch": 8.672796361560119, "grad_norm": 2.055565118789673, "learning_rate": 3.784247550270313e-07, "loss": 1.9836, "num_input_tokens_seen": 448521360, "step": 440500 }, { "epoch": 8.674765214309621, "grad_norm": 1.8370145559310913, "learning_rate": 3.7822477265481725e-07, "loss": 2.0119, "num_input_tokens_seen": 448623760, "step": 440600 }, { "epoch": 8.676734067059124, "grad_norm": 2.077500343322754, "learning_rate": 3.7802481098945325e-07, "loss": 1.9884, "num_input_tokens_seen": 448724848, "step": 440700 }, { "epoch": 8.678702919808627, "grad_norm": 4.799295425415039, "learning_rate": 3.7782487006494155e-07, "loss": 1.943, "num_input_tokens_seen": 448827248, "step": 440800 }, { "epoch": 8.68067177255813, "grad_norm": 2.258174180984497, "learning_rate": 3.776249499152801e-07, "loss": 2.0372, "num_input_tokens_seen": 448928944, "step": 440900 }, { "epoch": 8.682640625307632, "grad_norm": 2.02398419380188, "learning_rate": 3.774250505744639e-07, "loss": 1.9563, "num_input_tokens_seen": 449030248, "step": 441000 }, { "epoch": 8.684609478057137, "grad_norm": 2.144974946975708, "learning_rate": 3.7722517207648385e-07, "loss": 2.0016, "num_input_tokens_seen": 449131240, "step": 441100 }, { "epoch": 8.68657833080664, "grad_norm": 1.9988431930541992, "learning_rate": 3.770253144553279e-07, "loss": 2.0404, "num_input_tokens_seen": 449233192, "step": 441200 }, { "epoch": 8.688547183556143, "grad_norm": 1.8716778755187988, "learning_rate": 3.7682547774497985e-07, "loss": 2.0045, "num_input_tokens_seen": 449335592, "step": 441300 }, { "epoch": 8.690516036305645, "grad_norm": 1.728929877281189, "learning_rate": 3.766256619794205e-07, "loss": 1.9987, "num_input_tokens_seen": 449437992, "step": 441400 }, { "epoch": 8.692484889055148, "grad_norm": 1.7776018381118774, "learning_rate": 3.764258671926267e-07, "loss": 2.012, "num_input_tokens_seen": 449538968, "step": 441500 }, { "epoch": 8.69445374180465, "grad_norm": 2.1455764770507812, "learning_rate": 3.762260934185719e-07, "loss": 1.9857, "num_input_tokens_seen": 449640816, "step": 441600 }, { "epoch": 8.696422594554154, "grad_norm": 1.8637111186981201, "learning_rate": 3.760263406912261e-07, "loss": 1.9827, "num_input_tokens_seen": 449742352, "step": 441700 }, { "epoch": 8.698391447303656, "grad_norm": 1.8431859016418457, "learning_rate": 3.758266090445553e-07, "loss": 1.9565, "num_input_tokens_seen": 449844104, "step": 441800 }, { "epoch": 8.700360300053159, "grad_norm": 2.1522130966186523, "learning_rate": 3.7562689851252233e-07, "loss": 1.9257, "num_input_tokens_seen": 449946504, "step": 441900 }, { "epoch": 8.702329152802662, "grad_norm": 1.9919891357421875, "learning_rate": 3.754272091290862e-07, "loss": 1.9943, "num_input_tokens_seen": 450047648, "step": 442000 }, { "epoch": 8.704298005552165, "grad_norm": 1.9764221906661987, "learning_rate": 3.7522754092820256e-07, "loss": 1.9611, "num_input_tokens_seen": 450149208, "step": 442100 }, { "epoch": 8.706266858301667, "grad_norm": 2.01125431060791, "learning_rate": 3.7502789394382316e-07, "loss": 1.9854, "num_input_tokens_seen": 450251608, "step": 442200 }, { "epoch": 8.70823571105117, "grad_norm": 1.8752394914627075, "learning_rate": 3.748282682098964e-07, "loss": 1.9714, "num_input_tokens_seen": 450354008, "step": 442300 }, { "epoch": 8.710204563800673, "grad_norm": 2.136979341506958, "learning_rate": 3.7462866376036674e-07, "loss": 1.9278, "num_input_tokens_seen": 450456408, "step": 442400 }, { "epoch": 8.712173416550176, "grad_norm": 1.881626009941101, "learning_rate": 3.7442908062917533e-07, "loss": 1.9564, "num_input_tokens_seen": 450558808, "step": 442500 }, { "epoch": 8.714142269299678, "grad_norm": 1.8991940021514893, "learning_rate": 3.7422951885025986e-07, "loss": 1.9451, "num_input_tokens_seen": 450661208, "step": 442600 }, { "epoch": 8.716111122049181, "grad_norm": 1.857869267463684, "learning_rate": 3.740299784575537e-07, "loss": 2.01, "num_input_tokens_seen": 450763064, "step": 442700 }, { "epoch": 8.718079974798684, "grad_norm": 1.9196866750717163, "learning_rate": 3.7383045948498754e-07, "loss": 1.9864, "num_input_tokens_seen": 450865464, "step": 442800 }, { "epoch": 8.720048827548188, "grad_norm": 1.865019679069519, "learning_rate": 3.7363096196648723e-07, "loss": 1.9767, "num_input_tokens_seen": 450967864, "step": 442900 }, { "epoch": 8.722017680297691, "grad_norm": 2.0263545513153076, "learning_rate": 3.734314859359763e-07, "loss": 1.9688, "num_input_tokens_seen": 451069624, "step": 443000 }, { "epoch": 8.723986533047194, "grad_norm": 1.958918571472168, "learning_rate": 3.7323203142737336e-07, "loss": 1.9957, "num_input_tokens_seen": 451170392, "step": 443100 }, { "epoch": 8.725955385796697, "grad_norm": 1.7758839130401611, "learning_rate": 3.7303259847459457e-07, "loss": 1.9405, "num_input_tokens_seen": 451272792, "step": 443200 }, { "epoch": 8.7279242385462, "grad_norm": 1.9893596172332764, "learning_rate": 3.728331871115512e-07, "loss": 2.0114, "num_input_tokens_seen": 451375192, "step": 443300 }, { "epoch": 8.729893091295702, "grad_norm": 1.9820750951766968, "learning_rate": 3.726337973721521e-07, "loss": 1.9643, "num_input_tokens_seen": 451477040, "step": 443400 }, { "epoch": 8.731861944045205, "grad_norm": 1.8078739643096924, "learning_rate": 3.724344292903012e-07, "loss": 1.957, "num_input_tokens_seen": 451579440, "step": 443500 }, { "epoch": 8.733830796794708, "grad_norm": 1.7033534049987793, "learning_rate": 3.722350828998998e-07, "loss": 1.9845, "num_input_tokens_seen": 451681840, "step": 443600 }, { "epoch": 8.73579964954421, "grad_norm": 2.0560247898101807, "learning_rate": 3.7203575823484497e-07, "loss": 2.0014, "num_input_tokens_seen": 451782536, "step": 443700 }, { "epoch": 8.737768502293713, "grad_norm": 3.4981229305267334, "learning_rate": 3.718364553290301e-07, "loss": 2.0083, "num_input_tokens_seen": 451884344, "step": 443800 }, { "epoch": 8.739737355043216, "grad_norm": 2.064596652984619, "learning_rate": 3.716371742163451e-07, "loss": 1.9539, "num_input_tokens_seen": 451986744, "step": 443900 }, { "epoch": 8.741706207792719, "grad_norm": 1.8347772359848022, "learning_rate": 3.714379149306758e-07, "loss": 2.0239, "num_input_tokens_seen": 452088856, "step": 444000 }, { "epoch": 8.743675060542222, "grad_norm": 2.1077797412872314, "learning_rate": 3.7123867750590493e-07, "loss": 1.9747, "num_input_tokens_seen": 452189624, "step": 444100 }, { "epoch": 8.745643913291724, "grad_norm": 1.8972969055175781, "learning_rate": 3.7103946197591073e-07, "loss": 1.9705, "num_input_tokens_seen": 452291248, "step": 444200 }, { "epoch": 8.747612766041227, "grad_norm": 1.5250269174575806, "learning_rate": 3.708402683745685e-07, "loss": 2.0073, "num_input_tokens_seen": 452393648, "step": 444300 }, { "epoch": 8.74958161879073, "grad_norm": 1.7753756046295166, "learning_rate": 3.7064109673574917e-07, "loss": 2.003, "num_input_tokens_seen": 452494688, "step": 444400 }, { "epoch": 8.751550471540234, "grad_norm": 1.8046605587005615, "learning_rate": 3.704419470933202e-07, "loss": 1.9798, "num_input_tokens_seen": 452596408, "step": 444500 }, { "epoch": 8.753519324289737, "grad_norm": 1.854323387145996, "learning_rate": 3.702428194811455e-07, "loss": 1.9931, "num_input_tokens_seen": 452698808, "step": 444600 }, { "epoch": 8.75548817703924, "grad_norm": 2.2141740322113037, "learning_rate": 3.7004371393308484e-07, "loss": 2.0221, "num_input_tokens_seen": 452799992, "step": 444700 }, { "epoch": 8.757457029788743, "grad_norm": 1.8548610210418701, "learning_rate": 3.6984463048299465e-07, "loss": 2.0111, "num_input_tokens_seen": 452900960, "step": 444800 }, { "epoch": 8.759425882538245, "grad_norm": 2.267636299133301, "learning_rate": 3.696455691647271e-07, "loss": 1.9805, "num_input_tokens_seen": 453002720, "step": 444900 }, { "epoch": 8.761394735287748, "grad_norm": 2.0605995655059814, "learning_rate": 3.694465300121312e-07, "loss": 2.0197, "num_input_tokens_seen": 453104312, "step": 445000 }, { "epoch": 8.763363588037251, "grad_norm": 2.4092280864715576, "learning_rate": 3.6924751305905157e-07, "loss": 1.9399, "num_input_tokens_seen": 453205288, "step": 445100 }, { "epoch": 8.765332440786754, "grad_norm": 1.8316638469696045, "learning_rate": 3.690485183393297e-07, "loss": 1.9581, "num_input_tokens_seen": 453307688, "step": 445200 }, { "epoch": 8.767301293536256, "grad_norm": 2.0612685680389404, "learning_rate": 3.6884954588680254e-07, "loss": 2.0297, "num_input_tokens_seen": 453409512, "step": 445300 }, { "epoch": 8.76927014628576, "grad_norm": 2.094058036804199, "learning_rate": 3.6865059573530427e-07, "loss": 2.013, "num_input_tokens_seen": 453511152, "step": 445400 }, { "epoch": 8.771238999035262, "grad_norm": 1.5972319841384888, "learning_rate": 3.68451667918664e-07, "loss": 1.993, "num_input_tokens_seen": 453613552, "step": 445500 }, { "epoch": 8.773207851784765, "grad_norm": 2.213054895401001, "learning_rate": 3.682527624707082e-07, "loss": 1.9934, "num_input_tokens_seen": 453715472, "step": 445600 }, { "epoch": 8.775176704534267, "grad_norm": 2.0300986766815186, "learning_rate": 3.6805387942525904e-07, "loss": 1.9601, "num_input_tokens_seen": 453817872, "step": 445700 }, { "epoch": 8.77714555728377, "grad_norm": 5.396900177001953, "learning_rate": 3.6785501881613466e-07, "loss": 1.971, "num_input_tokens_seen": 453919544, "step": 445800 }, { "epoch": 8.779114410033273, "grad_norm": 1.8516314029693604, "learning_rate": 3.6765618067714997e-07, "loss": 2.0277, "num_input_tokens_seen": 454021944, "step": 445900 }, { "epoch": 8.781083262782776, "grad_norm": 1.9930393695831299, "learning_rate": 3.6745736504211533e-07, "loss": 1.959, "num_input_tokens_seen": 454123072, "step": 446000 }, { "epoch": 8.783052115532278, "grad_norm": 1.9945868253707886, "learning_rate": 3.67258571944838e-07, "loss": 2.0495, "num_input_tokens_seen": 454225472, "step": 446100 }, { "epoch": 8.785020968281783, "grad_norm": 1.9024966955184937, "learning_rate": 3.670598014191207e-07, "loss": 2.0057, "num_input_tokens_seen": 454327024, "step": 446200 }, { "epoch": 8.786989821031286, "grad_norm": 1.992486596107483, "learning_rate": 3.668610534987631e-07, "loss": 2.0012, "num_input_tokens_seen": 454429424, "step": 446300 }, { "epoch": 8.788958673780789, "grad_norm": 2.2042038440704346, "learning_rate": 3.666623282175603e-07, "loss": 1.9778, "num_input_tokens_seen": 454531272, "step": 446400 }, { "epoch": 8.790927526530291, "grad_norm": 3.83732533454895, "learning_rate": 3.66463625609304e-07, "loss": 2.0007, "num_input_tokens_seen": 454632824, "step": 446500 }, { "epoch": 8.792896379279794, "grad_norm": 1.7048379182815552, "learning_rate": 3.662649457077819e-07, "loss": 1.9609, "num_input_tokens_seen": 454735224, "step": 446600 }, { "epoch": 8.794865232029297, "grad_norm": 1.6984621286392212, "learning_rate": 3.660662885467777e-07, "loss": 1.9816, "num_input_tokens_seen": 454836544, "step": 446700 }, { "epoch": 8.7968340847788, "grad_norm": 2.0316247940063477, "learning_rate": 3.658676541600716e-07, "loss": 1.9409, "num_input_tokens_seen": 454938408, "step": 446800 }, { "epoch": 8.798802937528302, "grad_norm": 2.1081910133361816, "learning_rate": 3.656690425814395e-07, "loss": 1.9658, "num_input_tokens_seen": 455038888, "step": 446900 }, { "epoch": 8.800771790277805, "grad_norm": 1.9483733177185059, "learning_rate": 3.654704538446538e-07, "loss": 2.0191, "num_input_tokens_seen": 455141288, "step": 447000 }, { "epoch": 8.802740643027308, "grad_norm": 3.234713315963745, "learning_rate": 3.652718879834827e-07, "loss": 1.9853, "num_input_tokens_seen": 455243688, "step": 447100 }, { "epoch": 8.80470949577681, "grad_norm": 1.7976369857788086, "learning_rate": 3.650733450316907e-07, "loss": 1.9376, "num_input_tokens_seen": 455345344, "step": 447200 }, { "epoch": 8.806678348526313, "grad_norm": 1.6544246673583984, "learning_rate": 3.6487482502303833e-07, "loss": 1.9435, "num_input_tokens_seen": 455447744, "step": 447300 }, { "epoch": 8.808647201275816, "grad_norm": 1.76790452003479, "learning_rate": 3.6467632799128236e-07, "loss": 1.981, "num_input_tokens_seen": 455550144, "step": 447400 }, { "epoch": 8.810616054025319, "grad_norm": 1.8400684595108032, "learning_rate": 3.644778539701753e-07, "loss": 1.9773, "num_input_tokens_seen": 455651888, "step": 447500 }, { "epoch": 8.812584906774822, "grad_norm": 1.904646873474121, "learning_rate": 3.64279402993466e-07, "loss": 1.9986, "num_input_tokens_seen": 455753608, "step": 447600 }, { "epoch": 8.814553759524324, "grad_norm": 2.317580461502075, "learning_rate": 3.640809750949e-07, "loss": 1.9769, "num_input_tokens_seen": 455856008, "step": 447700 }, { "epoch": 8.816522612273829, "grad_norm": 1.7815885543823242, "learning_rate": 3.638825703082173e-07, "loss": 2.0695, "num_input_tokens_seen": 455958408, "step": 447800 }, { "epoch": 8.818491465023332, "grad_norm": 1.9623051881790161, "learning_rate": 3.636841886671558e-07, "loss": 1.9935, "num_input_tokens_seen": 456060496, "step": 447900 }, { "epoch": 8.820460317772834, "grad_norm": 1.8556016683578491, "learning_rate": 3.63485830205448e-07, "loss": 1.9769, "num_input_tokens_seen": 456161528, "step": 448000 }, { "epoch": 8.822429170522337, "grad_norm": 1.9822044372558594, "learning_rate": 3.632874949568236e-07, "loss": 2.0191, "num_input_tokens_seen": 456263512, "step": 448100 }, { "epoch": 8.82439802327184, "grad_norm": 1.9502639770507812, "learning_rate": 3.6308918295500736e-07, "loss": 1.9856, "num_input_tokens_seen": 456365032, "step": 448200 }, { "epoch": 8.826366876021343, "grad_norm": 3.6109819412231445, "learning_rate": 3.6289089423372106e-07, "loss": 2.014, "num_input_tokens_seen": 456463664, "step": 448300 }, { "epoch": 8.828335728770845, "grad_norm": 2.757236957550049, "learning_rate": 3.626926288266814e-07, "loss": 2.0203, "num_input_tokens_seen": 456565664, "step": 448400 }, { "epoch": 8.830304581520348, "grad_norm": 1.8749210834503174, "learning_rate": 3.624943867676026e-07, "loss": 1.9957, "num_input_tokens_seen": 456667120, "step": 448500 }, { "epoch": 8.832273434269851, "grad_norm": 1.7945029735565186, "learning_rate": 3.6229616809019305e-07, "loss": 1.9833, "num_input_tokens_seen": 456769520, "step": 448600 }, { "epoch": 8.834242287019354, "grad_norm": 2.066924810409546, "learning_rate": 3.620979728281588e-07, "loss": 2.0258, "num_input_tokens_seen": 456869608, "step": 448700 }, { "epoch": 8.836211139768857, "grad_norm": 1.9594933986663818, "learning_rate": 3.6189980101520133e-07, "loss": 2.0104, "num_input_tokens_seen": 456970768, "step": 448800 }, { "epoch": 8.83817999251836, "grad_norm": 1.7667531967163086, "learning_rate": 3.6170165268501786e-07, "loss": 1.9563, "num_input_tokens_seen": 457073168, "step": 448900 }, { "epoch": 8.840148845267862, "grad_norm": 1.9170072078704834, "learning_rate": 3.615035278713019e-07, "loss": 2.0247, "num_input_tokens_seen": 457174920, "step": 449000 }, { "epoch": 8.842117698017365, "grad_norm": 2.301262617111206, "learning_rate": 3.613054266077428e-07, "loss": 1.9894, "num_input_tokens_seen": 457276104, "step": 449100 }, { "epoch": 8.844086550766868, "grad_norm": 2.0854878425598145, "learning_rate": 3.611073489280262e-07, "loss": 2.0195, "num_input_tokens_seen": 457378504, "step": 449200 }, { "epoch": 8.84605540351637, "grad_norm": 2.1191959381103516, "learning_rate": 3.609092948658333e-07, "loss": 2.0333, "num_input_tokens_seen": 457479344, "step": 449300 }, { "epoch": 8.848024256265873, "grad_norm": 2.0031979084014893, "learning_rate": 3.607112644548418e-07, "loss": 1.9964, "num_input_tokens_seen": 457580864, "step": 449400 }, { "epoch": 8.849993109015376, "grad_norm": 2.0754759311676025, "learning_rate": 3.605132577287249e-07, "loss": 2.0488, "num_input_tokens_seen": 457682344, "step": 449500 }, { "epoch": 8.85196196176488, "grad_norm": 1.924009919166565, "learning_rate": 3.60315274721152e-07, "loss": 1.9409, "num_input_tokens_seen": 457784744, "step": 449600 }, { "epoch": 8.853930814514383, "grad_norm": 2.0600244998931885, "learning_rate": 3.601173154657885e-07, "loss": 2.0058, "num_input_tokens_seen": 457887040, "step": 449700 }, { "epoch": 8.855899667263886, "grad_norm": 1.9509413242340088, "learning_rate": 3.5991937999629575e-07, "loss": 1.976, "num_input_tokens_seen": 457988544, "step": 449800 }, { "epoch": 8.857868520013389, "grad_norm": 2.0268354415893555, "learning_rate": 3.5972146834633094e-07, "loss": 1.9861, "num_input_tokens_seen": 458090944, "step": 449900 }, { "epoch": 8.859837372762891, "grad_norm": 1.9515221118927002, "learning_rate": 3.595235805495472e-07, "loss": 1.9914, "num_input_tokens_seen": 458193344, "step": 450000 }, { "epoch": 8.861806225512394, "grad_norm": 1.9911466836929321, "learning_rate": 3.593257166395938e-07, "loss": 2.0072, "num_input_tokens_seen": 458294048, "step": 450100 }, { "epoch": 8.863775078261897, "grad_norm": 1.843186616897583, "learning_rate": 3.591278766501157e-07, "loss": 1.9979, "num_input_tokens_seen": 458396272, "step": 450200 }, { "epoch": 8.8657439310114, "grad_norm": 1.930181622505188, "learning_rate": 3.589300606147542e-07, "loss": 2.0004, "num_input_tokens_seen": 458497200, "step": 450300 }, { "epoch": 8.867712783760902, "grad_norm": 1.896479606628418, "learning_rate": 3.5873226856714584e-07, "loss": 1.9616, "num_input_tokens_seen": 458598408, "step": 450400 }, { "epoch": 8.869681636510405, "grad_norm": 2.050766706466675, "learning_rate": 3.585345005409238e-07, "loss": 1.9723, "num_input_tokens_seen": 458700808, "step": 450500 }, { "epoch": 8.871650489259908, "grad_norm": 1.9739941358566284, "learning_rate": 3.5833675656971675e-07, "loss": 1.9593, "num_input_tokens_seen": 458803208, "step": 450600 }, { "epoch": 8.87361934200941, "grad_norm": 1.6464823484420776, "learning_rate": 3.581390366871493e-07, "loss": 2.0061, "num_input_tokens_seen": 458905608, "step": 450700 }, { "epoch": 8.875588194758913, "grad_norm": 1.916754126548767, "learning_rate": 3.579413409268424e-07, "loss": 1.945, "num_input_tokens_seen": 459007408, "step": 450800 }, { "epoch": 8.877557047508416, "grad_norm": 1.9122352600097656, "learning_rate": 3.5774366932241197e-07, "loss": 2.0026, "num_input_tokens_seen": 459109368, "step": 450900 }, { "epoch": 8.879525900257919, "grad_norm": 1.735640525817871, "learning_rate": 3.5754602190747107e-07, "loss": 1.956, "num_input_tokens_seen": 459211768, "step": 451000 }, { "epoch": 8.881494753007422, "grad_norm": 1.6875808238983154, "learning_rate": 3.573483987156274e-07, "loss": 1.9439, "num_input_tokens_seen": 459312560, "step": 451100 }, { "epoch": 8.883463605756926, "grad_norm": 1.9245717525482178, "learning_rate": 3.5715079978048557e-07, "loss": 1.9483, "num_input_tokens_seen": 459414960, "step": 451200 }, { "epoch": 8.885432458506429, "grad_norm": 1.9336355924606323, "learning_rate": 3.5695322513564505e-07, "loss": 1.9446, "num_input_tokens_seen": 459517360, "step": 451300 }, { "epoch": 8.887401311255932, "grad_norm": 2.0105578899383545, "learning_rate": 3.567556748147025e-07, "loss": 2.0121, "num_input_tokens_seen": 459618216, "step": 451400 }, { "epoch": 8.889370164005435, "grad_norm": 2.152290105819702, "learning_rate": 3.565581488512489e-07, "loss": 1.9641, "num_input_tokens_seen": 459719928, "step": 451500 }, { "epoch": 8.891339016754937, "grad_norm": 1.7344796657562256, "learning_rate": 3.5636064727887227e-07, "loss": 2.0073, "num_input_tokens_seen": 459821088, "step": 451600 }, { "epoch": 8.89330786950444, "grad_norm": 2.3190789222717285, "learning_rate": 3.5616317013115627e-07, "loss": 2.0011, "num_input_tokens_seen": 459923008, "step": 451700 }, { "epoch": 8.895276722253943, "grad_norm": 2.0082907676696777, "learning_rate": 3.5596571744167994e-07, "loss": 1.9735, "num_input_tokens_seen": 460024416, "step": 451800 }, { "epoch": 8.897245575003446, "grad_norm": 1.642967939376831, "learning_rate": 3.5576828924401855e-07, "loss": 1.9947, "num_input_tokens_seen": 460126816, "step": 451900 }, { "epoch": 8.899214427752948, "grad_norm": 1.9961860179901123, "learning_rate": 3.55570885571743e-07, "loss": 1.9902, "num_input_tokens_seen": 460228528, "step": 452000 }, { "epoch": 8.901183280502451, "grad_norm": 1.9979910850524902, "learning_rate": 3.553735064584202e-07, "loss": 1.978, "num_input_tokens_seen": 460330512, "step": 452100 }, { "epoch": 8.903152133251954, "grad_norm": 1.8616020679473877, "learning_rate": 3.5517615193761274e-07, "loss": 1.9982, "num_input_tokens_seen": 460432912, "step": 452200 }, { "epoch": 8.905120986001457, "grad_norm": 2.3574740886688232, "learning_rate": 3.5497882204287933e-07, "loss": 1.9914, "num_input_tokens_seen": 460534880, "step": 452300 }, { "epoch": 8.90708983875096, "grad_norm": 1.9747241735458374, "learning_rate": 3.5478151680777387e-07, "loss": 1.9967, "num_input_tokens_seen": 460637280, "step": 452400 }, { "epoch": 8.909058691500462, "grad_norm": 1.8633328676223755, "learning_rate": 3.5458423626584675e-07, "loss": 2.0346, "num_input_tokens_seen": 460739680, "step": 452500 }, { "epoch": 8.911027544249965, "grad_norm": 1.8827803134918213, "learning_rate": 3.543869804506437e-07, "loss": 2.0016, "num_input_tokens_seen": 460840632, "step": 452600 }, { "epoch": 8.912996396999468, "grad_norm": 1.8843450546264648, "learning_rate": 3.5418974939570644e-07, "loss": 1.9751, "num_input_tokens_seen": 460943032, "step": 452700 }, { "epoch": 8.91496524974897, "grad_norm": 2.1894655227661133, "learning_rate": 3.539925431345726e-07, "loss": 2.0151, "num_input_tokens_seen": 461044488, "step": 452800 }, { "epoch": 8.916934102498475, "grad_norm": 2.1176750659942627, "learning_rate": 3.537953617007752e-07, "loss": 1.9507, "num_input_tokens_seen": 461146888, "step": 452900 }, { "epoch": 8.918902955247978, "grad_norm": 1.8750076293945312, "learning_rate": 3.5359820512784347e-07, "loss": 2.0159, "num_input_tokens_seen": 461249288, "step": 453000 }, { "epoch": 8.92087180799748, "grad_norm": 1.8658981323242188, "learning_rate": 3.5340107344930206e-07, "loss": 1.9918, "num_input_tokens_seen": 461351688, "step": 453100 }, { "epoch": 8.922840660746983, "grad_norm": 1.9649168252944946, "learning_rate": 3.532039666986718e-07, "loss": 1.9929, "num_input_tokens_seen": 461454088, "step": 453200 }, { "epoch": 8.924809513496486, "grad_norm": 2.112973213195801, "learning_rate": 3.5300688490946873e-07, "loss": 1.9731, "num_input_tokens_seen": 461556488, "step": 453300 }, { "epoch": 8.926778366245989, "grad_norm": 1.8056161403656006, "learning_rate": 3.5280982811520517e-07, "loss": 1.9509, "num_input_tokens_seen": 461658888, "step": 453400 }, { "epoch": 8.928747218995492, "grad_norm": 2.0748400688171387, "learning_rate": 3.5261279634938877e-07, "loss": 1.9776, "num_input_tokens_seen": 461761288, "step": 453500 }, { "epoch": 8.930716071744994, "grad_norm": 1.9434674978256226, "learning_rate": 3.5241578964552333e-07, "loss": 1.9595, "num_input_tokens_seen": 461863096, "step": 453600 }, { "epoch": 8.932684924494497, "grad_norm": 1.6872519254684448, "learning_rate": 3.522188080371079e-07, "loss": 1.9883, "num_input_tokens_seen": 461965496, "step": 453700 }, { "epoch": 8.934653777244, "grad_norm": 1.85960853099823, "learning_rate": 3.5202185155763783e-07, "loss": 1.9884, "num_input_tokens_seen": 462067896, "step": 453800 }, { "epoch": 8.936622629993503, "grad_norm": 2.009847640991211, "learning_rate": 3.5182492024060383e-07, "loss": 2.0315, "num_input_tokens_seen": 462170296, "step": 453900 }, { "epoch": 8.938591482743005, "grad_norm": 2.876405715942383, "learning_rate": 3.5162801411949234e-07, "loss": 1.9948, "num_input_tokens_seen": 462272696, "step": 454000 }, { "epoch": 8.940560335492508, "grad_norm": 1.9534741640090942, "learning_rate": 3.5143113322778574e-07, "loss": 2.0359, "num_input_tokens_seen": 462374432, "step": 454100 }, { "epoch": 8.94252918824201, "grad_norm": 1.8362634181976318, "learning_rate": 3.5123427759896167e-07, "loss": 1.9737, "num_input_tokens_seen": 462476064, "step": 454200 }, { "epoch": 8.944498040991514, "grad_norm": 2.1819591522216797, "learning_rate": 3.510374472664941e-07, "loss": 1.9971, "num_input_tokens_seen": 462578464, "step": 454300 }, { "epoch": 8.946466893741016, "grad_norm": 1.9293622970581055, "learning_rate": 3.508406422638521e-07, "loss": 2.0277, "num_input_tokens_seen": 462679216, "step": 454400 }, { "epoch": 8.94843574649052, "grad_norm": 2.0786991119384766, "learning_rate": 3.50643862624501e-07, "loss": 2.0097, "num_input_tokens_seen": 462781616, "step": 454500 }, { "epoch": 8.950404599240024, "grad_norm": 1.8562805652618408, "learning_rate": 3.5044710838190107e-07, "loss": 1.9723, "num_input_tokens_seen": 462883488, "step": 454600 }, { "epoch": 8.952373451989526, "grad_norm": 2.069399356842041, "learning_rate": 3.502503795695091e-07, "loss": 1.9541, "num_input_tokens_seen": 462985888, "step": 454700 }, { "epoch": 8.95434230473903, "grad_norm": 1.9615408182144165, "learning_rate": 3.500536762207771e-07, "loss": 2.0205, "num_input_tokens_seen": 463088240, "step": 454800 }, { "epoch": 8.956311157488532, "grad_norm": 2.12089467048645, "learning_rate": 3.4985699836915263e-07, "loss": 2.0027, "num_input_tokens_seen": 463188960, "step": 454900 }, { "epoch": 8.958280010238035, "grad_norm": 1.8620356321334839, "learning_rate": 3.496603460480793e-07, "loss": 1.9786, "num_input_tokens_seen": 463289064, "step": 455000 }, { "epoch": 8.960248862987537, "grad_norm": 1.6644980907440186, "learning_rate": 3.4946371929099605e-07, "loss": 1.9584, "num_input_tokens_seen": 463391464, "step": 455100 }, { "epoch": 8.96221771573704, "grad_norm": 1.7266654968261719, "learning_rate": 3.4926711813133773e-07, "loss": 1.9637, "num_input_tokens_seen": 463491528, "step": 455200 }, { "epoch": 8.964186568486543, "grad_norm": 2.0514442920684814, "learning_rate": 3.4907054260253455e-07, "loss": 2.0351, "num_input_tokens_seen": 463593008, "step": 455300 }, { "epoch": 8.966155421236046, "grad_norm": 1.8872276544570923, "learning_rate": 3.488739927380127e-07, "loss": 1.9595, "num_input_tokens_seen": 463694592, "step": 455400 }, { "epoch": 8.968124273985548, "grad_norm": 2.1632421016693115, "learning_rate": 3.4867746857119365e-07, "loss": 1.9792, "num_input_tokens_seen": 463796992, "step": 455500 }, { "epoch": 8.970093126735051, "grad_norm": 1.9777168035507202, "learning_rate": 3.484809701354948e-07, "loss": 2.0071, "num_input_tokens_seen": 463898024, "step": 455600 }, { "epoch": 8.972061979484554, "grad_norm": 2.0591378211975098, "learning_rate": 3.4828449746432896e-07, "loss": 1.9898, "num_input_tokens_seen": 463999600, "step": 455700 }, { "epoch": 8.974030832234057, "grad_norm": 1.902843952178955, "learning_rate": 3.480880505911046e-07, "loss": 1.998, "num_input_tokens_seen": 464101824, "step": 455800 }, { "epoch": 8.97599968498356, "grad_norm": 1.7229907512664795, "learning_rate": 3.478916295492262e-07, "loss": 2.0109, "num_input_tokens_seen": 464203464, "step": 455900 }, { "epoch": 8.977968537733062, "grad_norm": 1.8738839626312256, "learning_rate": 3.47695234372093e-07, "loss": 2.0182, "num_input_tokens_seen": 464305472, "step": 456000 }, { "epoch": 8.979937390482565, "grad_norm": 2.030946969985962, "learning_rate": 3.474988650931009e-07, "loss": 1.9661, "num_input_tokens_seen": 464407872, "step": 456100 }, { "epoch": 8.981906243232068, "grad_norm": 1.7908202409744263, "learning_rate": 3.4730252174564017e-07, "loss": 1.9571, "num_input_tokens_seen": 464510272, "step": 456200 }, { "epoch": 8.983875095981572, "grad_norm": 2.0152580738067627, "learning_rate": 3.471062043630981e-07, "loss": 2.0023, "num_input_tokens_seen": 464612672, "step": 456300 }, { "epoch": 8.985843948731075, "grad_norm": 1.8081436157226562, "learning_rate": 3.46909912978856e-07, "loss": 1.9853, "num_input_tokens_seen": 464715064, "step": 456400 }, { "epoch": 8.987812801480578, "grad_norm": 1.9940552711486816, "learning_rate": 3.467136476262924e-07, "loss": 1.9396, "num_input_tokens_seen": 464816272, "step": 456500 }, { "epoch": 8.98978165423008, "grad_norm": 1.7960771322250366, "learning_rate": 3.4651740833877975e-07, "loss": 2.019, "num_input_tokens_seen": 464917816, "step": 456600 }, { "epoch": 8.991750506979583, "grad_norm": 1.9710174798965454, "learning_rate": 3.4632119514968744e-07, "loss": 2.0066, "num_input_tokens_seen": 465018928, "step": 456700 }, { "epoch": 8.993719359729086, "grad_norm": 1.9642566442489624, "learning_rate": 3.461250080923798e-07, "loss": 1.9996, "num_input_tokens_seen": 465120168, "step": 456800 }, { "epoch": 8.995688212478589, "grad_norm": 1.924302339553833, "learning_rate": 3.4592884720021654e-07, "loss": 1.9869, "num_input_tokens_seen": 465221984, "step": 456900 }, { "epoch": 8.997657065228092, "grad_norm": 1.79349946975708, "learning_rate": 3.4573271250655345e-07, "loss": 1.9616, "num_input_tokens_seen": 465323832, "step": 457000 }, { "epoch": 8.999625917977594, "grad_norm": 1.846758484840393, "learning_rate": 3.4553660404474136e-07, "loss": 1.9666, "num_input_tokens_seen": 465425696, "step": 457100 }, { "epoch": 9.001594770727097, "grad_norm": 3.0008890628814697, "learning_rate": 3.453405218481271e-07, "loss": 2.0245, "num_input_tokens_seen": 465527952, "step": 457200 }, { "epoch": 9.0035636234766, "grad_norm": 1.8662006855010986, "learning_rate": 3.451444659500524e-07, "loss": 1.9865, "num_input_tokens_seen": 465630352, "step": 457300 }, { "epoch": 9.005532476226103, "grad_norm": 1.9708071947097778, "learning_rate": 3.449484363838554e-07, "loss": 1.9751, "num_input_tokens_seen": 465732208, "step": 457400 }, { "epoch": 9.007501328975605, "grad_norm": 1.8690742254257202, "learning_rate": 3.4475243318286885e-07, "loss": 1.9877, "num_input_tokens_seen": 465834608, "step": 457500 }, { "epoch": 9.009470181725108, "grad_norm": 1.891236424446106, "learning_rate": 3.4455645638042173e-07, "loss": 1.9947, "num_input_tokens_seen": 465935952, "step": 457600 }, { "epoch": 9.011439034474611, "grad_norm": 1.934617519378662, "learning_rate": 3.44360506009838e-07, "loss": 1.967, "num_input_tokens_seen": 466037480, "step": 457700 }, { "epoch": 9.013407887224114, "grad_norm": 2.242408037185669, "learning_rate": 3.4416458210443755e-07, "loss": 2.0061, "num_input_tokens_seen": 466138688, "step": 457800 }, { "epoch": 9.015376739973618, "grad_norm": 2.3857059478759766, "learning_rate": 3.439686846975357e-07, "loss": 1.9726, "num_input_tokens_seen": 466239656, "step": 457900 }, { "epoch": 9.017345592723121, "grad_norm": 1.8627309799194336, "learning_rate": 3.437728138224428e-07, "loss": 1.9786, "num_input_tokens_seen": 466340224, "step": 458000 }, { "epoch": 9.019314445472624, "grad_norm": 1.5528455972671509, "learning_rate": 3.435769695124653e-07, "loss": 1.9627, "num_input_tokens_seen": 466442104, "step": 458100 }, { "epoch": 9.021283298222126, "grad_norm": 2.029254674911499, "learning_rate": 3.433811518009048e-07, "loss": 1.9436, "num_input_tokens_seen": 466544504, "step": 458200 }, { "epoch": 9.02325215097163, "grad_norm": 2.0019540786743164, "learning_rate": 3.431853607210584e-07, "loss": 1.9877, "num_input_tokens_seen": 466646264, "step": 458300 }, { "epoch": 9.025221003721132, "grad_norm": 2.2268126010894775, "learning_rate": 3.429895963062188e-07, "loss": 1.9771, "num_input_tokens_seen": 466748064, "step": 458400 }, { "epoch": 9.027189856470635, "grad_norm": 2.0140953063964844, "learning_rate": 3.4279385858967414e-07, "loss": 1.991, "num_input_tokens_seen": 466850464, "step": 458500 }, { "epoch": 9.029158709220138, "grad_norm": 1.559033989906311, "learning_rate": 3.4259814760470774e-07, "loss": 2.0166, "num_input_tokens_seen": 466952864, "step": 458600 }, { "epoch": 9.03112756196964, "grad_norm": 2.0017471313476562, "learning_rate": 3.424024633845989e-07, "loss": 1.9757, "num_input_tokens_seen": 467054560, "step": 458700 }, { "epoch": 9.033096414719143, "grad_norm": 2.088620185852051, "learning_rate": 3.4220680596262186e-07, "loss": 2.0129, "num_input_tokens_seen": 467156424, "step": 458800 }, { "epoch": 9.035065267468646, "grad_norm": 1.869077205657959, "learning_rate": 3.4201117537204636e-07, "loss": 1.9643, "num_input_tokens_seen": 467258824, "step": 458900 }, { "epoch": 9.037034120218149, "grad_norm": 1.6668530702590942, "learning_rate": 3.4181557164613833e-07, "loss": 1.9674, "num_input_tokens_seen": 467360768, "step": 459000 }, { "epoch": 9.039002972967651, "grad_norm": 2.469242572784424, "learning_rate": 3.4161999481815785e-07, "loss": 1.9679, "num_input_tokens_seen": 467462536, "step": 459100 }, { "epoch": 9.040971825717154, "grad_norm": 4.868458271026611, "learning_rate": 3.414244449213617e-07, "loss": 1.9877, "num_input_tokens_seen": 467563664, "step": 459200 }, { "epoch": 9.042940678466657, "grad_norm": 1.9527435302734375, "learning_rate": 3.412289219890009e-07, "loss": 2.0316, "num_input_tokens_seen": 467663904, "step": 459300 }, { "epoch": 9.04490953121616, "grad_norm": 1.9218140840530396, "learning_rate": 3.4103342605432305e-07, "loss": 1.9859, "num_input_tokens_seen": 467765456, "step": 459400 }, { "epoch": 9.046878383965662, "grad_norm": 1.7027376890182495, "learning_rate": 3.408379571505701e-07, "loss": 1.9911, "num_input_tokens_seen": 467867856, "step": 459500 }, { "epoch": 9.048847236715167, "grad_norm": 2.346632480621338, "learning_rate": 3.4064251531098034e-07, "loss": 1.9657, "num_input_tokens_seen": 467968848, "step": 459600 }, { "epoch": 9.05081608946467, "grad_norm": 1.9321973323822021, "learning_rate": 3.4044710056878646e-07, "loss": 1.9945, "num_input_tokens_seen": 468071248, "step": 459700 }, { "epoch": 9.052784942214172, "grad_norm": 1.9816904067993164, "learning_rate": 3.402517129572175e-07, "loss": 2.0039, "num_input_tokens_seen": 468173648, "step": 459800 }, { "epoch": 9.054753794963675, "grad_norm": 2.163686513900757, "learning_rate": 3.400563525094975e-07, "loss": 2.0248, "num_input_tokens_seen": 468275312, "step": 459900 }, { "epoch": 9.056722647713178, "grad_norm": 1.7017338275909424, "learning_rate": 3.3986101925884556e-07, "loss": 1.9548, "num_input_tokens_seen": 468377216, "step": 460000 }, { "epoch": 9.05869150046268, "grad_norm": 1.8634601831436157, "learning_rate": 3.396657132384768e-07, "loss": 2.0347, "num_input_tokens_seen": 468479064, "step": 460100 }, { "epoch": 9.060660353212183, "grad_norm": 1.8698630332946777, "learning_rate": 3.3947043448160106e-07, "loss": 1.9563, "num_input_tokens_seen": 468580176, "step": 460200 }, { "epoch": 9.062629205961686, "grad_norm": 1.9434471130371094, "learning_rate": 3.392751830214241e-07, "loss": 1.9627, "num_input_tokens_seen": 468681344, "step": 460300 }, { "epoch": 9.064598058711189, "grad_norm": 3.5381059646606445, "learning_rate": 3.3907995889114647e-07, "loss": 1.9417, "num_input_tokens_seen": 468783744, "step": 460400 }, { "epoch": 9.066566911460692, "grad_norm": 1.76524817943573, "learning_rate": 3.388847621239648e-07, "loss": 1.9711, "num_input_tokens_seen": 468886144, "step": 460500 }, { "epoch": 9.068535764210194, "grad_norm": 2.0872108936309814, "learning_rate": 3.386895927530702e-07, "loss": 1.9846, "num_input_tokens_seen": 468988544, "step": 460600 }, { "epoch": 9.070504616959697, "grad_norm": 1.856087327003479, "learning_rate": 3.384944508116499e-07, "loss": 1.9648, "num_input_tokens_seen": 469090944, "step": 460700 }, { "epoch": 9.0724734697092, "grad_norm": 2.1265947818756104, "learning_rate": 3.38299336332886e-07, "loss": 1.9657, "num_input_tokens_seen": 469192808, "step": 460800 }, { "epoch": 9.074442322458703, "grad_norm": 1.7705391645431519, "learning_rate": 3.3810424934995605e-07, "loss": 1.9983, "num_input_tokens_seen": 469293984, "step": 460900 }, { "epoch": 9.076411175208206, "grad_norm": 2.031543493270874, "learning_rate": 3.379091898960331e-07, "loss": 1.9514, "num_input_tokens_seen": 469396384, "step": 461000 }, { "epoch": 9.078380027957708, "grad_norm": 1.767552375793457, "learning_rate": 3.3771415800428527e-07, "loss": 1.9298, "num_input_tokens_seen": 469498784, "step": 461100 }, { "epoch": 9.080348880707213, "grad_norm": 2.2106571197509766, "learning_rate": 3.375191537078761e-07, "loss": 1.9784, "num_input_tokens_seen": 469600832, "step": 461200 }, { "epoch": 9.082317733456716, "grad_norm": 1.9160747528076172, "learning_rate": 3.3732417703996435e-07, "loss": 2.0515, "num_input_tokens_seen": 469703232, "step": 461300 }, { "epoch": 9.084286586206218, "grad_norm": 2.072680950164795, "learning_rate": 3.3712922803370435e-07, "loss": 2.0081, "num_input_tokens_seen": 469804048, "step": 461400 }, { "epoch": 9.086255438955721, "grad_norm": 1.9819695949554443, "learning_rate": 3.369343067222452e-07, "loss": 2.025, "num_input_tokens_seen": 469906448, "step": 461500 }, { "epoch": 9.088224291705224, "grad_norm": 1.9884424209594727, "learning_rate": 3.3673941313873206e-07, "loss": 1.9767, "num_input_tokens_seen": 470008256, "step": 461600 }, { "epoch": 9.090193144454727, "grad_norm": 1.9748448133468628, "learning_rate": 3.3654454731630455e-07, "loss": 1.9599, "num_input_tokens_seen": 470109848, "step": 461700 }, { "epoch": 9.09216199720423, "grad_norm": 2.1398630142211914, "learning_rate": 3.36349709288098e-07, "loss": 1.9951, "num_input_tokens_seen": 470212248, "step": 461800 }, { "epoch": 9.094130849953732, "grad_norm": 2.4477145671844482, "learning_rate": 3.361548990872435e-07, "loss": 1.937, "num_input_tokens_seen": 470313776, "step": 461900 }, { "epoch": 9.096099702703235, "grad_norm": 2.2335894107818604, "learning_rate": 3.359601167468661e-07, "loss": 2.0437, "num_input_tokens_seen": 470415696, "step": 462000 }, { "epoch": 9.098068555452738, "grad_norm": 2.278855085372925, "learning_rate": 3.3576536230008747e-07, "loss": 2.0122, "num_input_tokens_seen": 470517224, "step": 462100 }, { "epoch": 9.10003740820224, "grad_norm": 1.8019264936447144, "learning_rate": 3.3557063578002365e-07, "loss": 1.9621, "num_input_tokens_seen": 470619624, "step": 462200 }, { "epoch": 9.102006260951743, "grad_norm": 1.8344491720199585, "learning_rate": 3.353759372197865e-07, "loss": 1.9552, "num_input_tokens_seen": 470722024, "step": 462300 }, { "epoch": 9.103975113701246, "grad_norm": 1.8435543775558472, "learning_rate": 3.351812666524826e-07, "loss": 1.9675, "num_input_tokens_seen": 470823176, "step": 462400 }, { "epoch": 9.105943966450749, "grad_norm": 1.8911259174346924, "learning_rate": 3.349866241112142e-07, "loss": 2.0105, "num_input_tokens_seen": 470924648, "step": 462500 }, { "epoch": 9.107912819200251, "grad_norm": 2.0813772678375244, "learning_rate": 3.347920096290785e-07, "loss": 2.0098, "num_input_tokens_seen": 471027048, "step": 462600 }, { "epoch": 9.109881671949754, "grad_norm": 2.155921459197998, "learning_rate": 3.345974232391683e-07, "loss": 1.9851, "num_input_tokens_seen": 471128008, "step": 462700 }, { "epoch": 9.111850524699257, "grad_norm": 2.0911948680877686, "learning_rate": 3.344028649745711e-07, "loss": 2.0088, "num_input_tokens_seen": 471229656, "step": 462800 }, { "epoch": 9.113819377448761, "grad_norm": 1.9464905261993408, "learning_rate": 3.3420833486837e-07, "loss": 2.0201, "num_input_tokens_seen": 471332056, "step": 462900 }, { "epoch": 9.115788230198264, "grad_norm": 1.860300064086914, "learning_rate": 3.3401383295364326e-07, "loss": 2.0, "num_input_tokens_seen": 471434456, "step": 463000 }, { "epoch": 9.117757082947767, "grad_norm": 1.978712558746338, "learning_rate": 3.3381935926346416e-07, "loss": 1.9544, "num_input_tokens_seen": 471536216, "step": 463100 }, { "epoch": 9.11972593569727, "grad_norm": 1.672761082649231, "learning_rate": 3.3362491383090144e-07, "loss": 1.9781, "num_input_tokens_seen": 471638144, "step": 463200 }, { "epoch": 9.121694788446773, "grad_norm": 2.275247097015381, "learning_rate": 3.334304966890188e-07, "loss": 1.9692, "num_input_tokens_seen": 471740544, "step": 463300 }, { "epoch": 9.123663641196275, "grad_norm": 1.9788198471069336, "learning_rate": 3.3323610787087544e-07, "loss": 2.0102, "num_input_tokens_seen": 471842408, "step": 463400 }, { "epoch": 9.125632493945778, "grad_norm": 2.262812376022339, "learning_rate": 3.330417474095252e-07, "loss": 2.0388, "num_input_tokens_seen": 471943608, "step": 463500 }, { "epoch": 9.12760134669528, "grad_norm": 2.114628791809082, "learning_rate": 3.328474153380177e-07, "loss": 2.0586, "num_input_tokens_seen": 472044560, "step": 463600 }, { "epoch": 9.129570199444784, "grad_norm": 1.8643527030944824, "learning_rate": 3.326531116893974e-07, "loss": 2.0018, "num_input_tokens_seen": 472145456, "step": 463700 }, { "epoch": 9.131539052194286, "grad_norm": 2.0743651390075684, "learning_rate": 3.324588364967038e-07, "loss": 2.0114, "num_input_tokens_seen": 472246504, "step": 463800 }, { "epoch": 9.133507904943789, "grad_norm": 1.91729736328125, "learning_rate": 3.3226458979297224e-07, "loss": 2.0308, "num_input_tokens_seen": 472348344, "step": 463900 }, { "epoch": 9.135476757693292, "grad_norm": 2.067162275314331, "learning_rate": 3.320703716112322e-07, "loss": 1.9563, "num_input_tokens_seen": 472450744, "step": 464000 }, { "epoch": 9.137445610442795, "grad_norm": 1.8478366136550903, "learning_rate": 3.3187618198450935e-07, "loss": 1.9685, "num_input_tokens_seen": 472552648, "step": 464100 }, { "epoch": 9.139414463192297, "grad_norm": 1.784961462020874, "learning_rate": 3.3168202094582344e-07, "loss": 2.0024, "num_input_tokens_seen": 472654536, "step": 464200 }, { "epoch": 9.1413833159418, "grad_norm": 1.885314702987671, "learning_rate": 3.3148788852819056e-07, "loss": 1.9917, "num_input_tokens_seen": 472755744, "step": 464300 }, { "epoch": 9.143352168691303, "grad_norm": 1.6667174100875854, "learning_rate": 3.3129378476462057e-07, "loss": 1.9891, "num_input_tokens_seen": 472858144, "step": 464400 }, { "epoch": 9.145321021440806, "grad_norm": 1.8386411666870117, "learning_rate": 3.310997096881198e-07, "loss": 1.9967, "num_input_tokens_seen": 472959768, "step": 464500 }, { "epoch": 9.14728987419031, "grad_norm": 1.8697896003723145, "learning_rate": 3.309056633316886e-07, "loss": 1.9958, "num_input_tokens_seen": 473062168, "step": 464600 }, { "epoch": 9.149258726939813, "grad_norm": 2.16806697845459, "learning_rate": 3.3071164572832346e-07, "loss": 2.0571, "num_input_tokens_seen": 473163344, "step": 464700 }, { "epoch": 9.151227579689316, "grad_norm": 1.8884488344192505, "learning_rate": 3.3051765691101474e-07, "loss": 1.9225, "num_input_tokens_seen": 473265112, "step": 464800 }, { "epoch": 9.153196432438818, "grad_norm": 1.9401971101760864, "learning_rate": 3.303236969127492e-07, "loss": 1.9586, "num_input_tokens_seen": 473367512, "step": 464900 }, { "epoch": 9.155165285188321, "grad_norm": 1.9680757522583008, "learning_rate": 3.3012976576650785e-07, "loss": 1.9973, "num_input_tokens_seen": 473469912, "step": 465000 }, { "epoch": 9.157134137937824, "grad_norm": 2.009850025177002, "learning_rate": 3.299358635052671e-07, "loss": 1.9918, "num_input_tokens_seen": 473571536, "step": 465100 }, { "epoch": 9.159102990687327, "grad_norm": 1.7671650648117065, "learning_rate": 3.2974199016199844e-07, "loss": 1.9498, "num_input_tokens_seen": 473673936, "step": 465200 }, { "epoch": 9.16107184343683, "grad_norm": 2.0762922763824463, "learning_rate": 3.2954814576966816e-07, "loss": 1.9651, "num_input_tokens_seen": 473775760, "step": 465300 }, { "epoch": 9.163040696186332, "grad_norm": 1.9856797456741333, "learning_rate": 3.293543303612382e-07, "loss": 1.9336, "num_input_tokens_seen": 473878160, "step": 465400 }, { "epoch": 9.165009548935835, "grad_norm": 1.8416602611541748, "learning_rate": 3.29160543969665e-07, "loss": 1.9652, "num_input_tokens_seen": 473979912, "step": 465500 }, { "epoch": 9.166978401685338, "grad_norm": 1.8863821029663086, "learning_rate": 3.289667866279004e-07, "loss": 1.9712, "num_input_tokens_seen": 474082312, "step": 465600 }, { "epoch": 9.16894725443484, "grad_norm": 1.877996802330017, "learning_rate": 3.2877305836889113e-07, "loss": 2.044, "num_input_tokens_seen": 474183328, "step": 465700 }, { "epoch": 9.170916107184343, "grad_norm": 1.847738265991211, "learning_rate": 3.285793592255791e-07, "loss": 1.9763, "num_input_tokens_seen": 474285552, "step": 465800 }, { "epoch": 9.172884959933846, "grad_norm": 2.134746551513672, "learning_rate": 3.2838568923090115e-07, "loss": 2.0322, "num_input_tokens_seen": 474387680, "step": 465900 }, { "epoch": 9.174853812683349, "grad_norm": 2.0179078578948975, "learning_rate": 3.2819204841778936e-07, "loss": 2.047, "num_input_tokens_seen": 474489416, "step": 466000 }, { "epoch": 9.176822665432852, "grad_norm": 1.8857274055480957, "learning_rate": 3.2799843681917066e-07, "loss": 1.9924, "num_input_tokens_seen": 474591816, "step": 466100 }, { "epoch": 9.178791518182354, "grad_norm": 1.9603630304336548, "learning_rate": 3.27804854467967e-07, "loss": 1.9742, "num_input_tokens_seen": 474692704, "step": 466200 }, { "epoch": 9.180760370931859, "grad_norm": 1.8099164962768555, "learning_rate": 3.276113013970955e-07, "loss": 1.9653, "num_input_tokens_seen": 474793976, "step": 466300 }, { "epoch": 9.182729223681362, "grad_norm": 2.3027429580688477, "learning_rate": 3.274177776394682e-07, "loss": 1.9498, "num_input_tokens_seen": 474895736, "step": 466400 }, { "epoch": 9.184698076430864, "grad_norm": 1.7471723556518555, "learning_rate": 3.272242832279922e-07, "loss": 1.9733, "num_input_tokens_seen": 474997872, "step": 466500 }, { "epoch": 9.186666929180367, "grad_norm": 1.8851635456085205, "learning_rate": 3.2703081819556957e-07, "loss": 1.988, "num_input_tokens_seen": 475099320, "step": 466600 }, { "epoch": 9.18863578192987, "grad_norm": 1.6516727209091187, "learning_rate": 3.268373825750974e-07, "loss": 1.9975, "num_input_tokens_seen": 475201616, "step": 466700 }, { "epoch": 9.190604634679373, "grad_norm": 1.8464986085891724, "learning_rate": 3.266439763994678e-07, "loss": 1.984, "num_input_tokens_seen": 475303720, "step": 466800 }, { "epoch": 9.192573487428875, "grad_norm": 1.9440335035324097, "learning_rate": 3.2645059970156776e-07, "loss": 2.0096, "num_input_tokens_seen": 475406120, "step": 466900 }, { "epoch": 9.194542340178378, "grad_norm": 1.971192479133606, "learning_rate": 3.262572525142797e-07, "loss": 1.9945, "num_input_tokens_seen": 475507832, "step": 467000 }, { "epoch": 9.19651119292788, "grad_norm": 2.179316759109497, "learning_rate": 3.2606393487048017e-07, "loss": 1.9935, "num_input_tokens_seen": 475608792, "step": 467100 }, { "epoch": 9.198480045677384, "grad_norm": 3.94113826751709, "learning_rate": 3.2587064680304177e-07, "loss": 1.9534, "num_input_tokens_seen": 475710400, "step": 467200 }, { "epoch": 9.200448898426886, "grad_norm": 1.9328970909118652, "learning_rate": 3.2567738834483086e-07, "loss": 1.9876, "num_input_tokens_seen": 475812800, "step": 467300 }, { "epoch": 9.20241775117639, "grad_norm": 1.877854347229004, "learning_rate": 3.254841595287101e-07, "loss": 1.9952, "num_input_tokens_seen": 475914248, "step": 467400 }, { "epoch": 9.204386603925892, "grad_norm": 1.911928415298462, "learning_rate": 3.252909603875358e-07, "loss": 2.0325, "num_input_tokens_seen": 476015624, "step": 467500 }, { "epoch": 9.206355456675395, "grad_norm": 1.9240180253982544, "learning_rate": 3.250977909541604e-07, "loss": 2.0211, "num_input_tokens_seen": 476117536, "step": 467600 }, { "epoch": 9.208324309424897, "grad_norm": 2.091864585876465, "learning_rate": 3.249046512614302e-07, "loss": 1.9722, "num_input_tokens_seen": 476219936, "step": 467700 }, { "epoch": 9.2102931621744, "grad_norm": 1.8455979824066162, "learning_rate": 3.2471154134218747e-07, "loss": 1.9958, "num_input_tokens_seen": 476321784, "step": 467800 }, { "epoch": 9.212262014923905, "grad_norm": 1.8556621074676514, "learning_rate": 3.245184612292684e-07, "loss": 1.9936, "num_input_tokens_seen": 476423248, "step": 467900 }, { "epoch": 9.214230867673407, "grad_norm": 1.7449880838394165, "learning_rate": 3.2432541095550496e-07, "loss": 1.9763, "num_input_tokens_seen": 476524256, "step": 468000 }, { "epoch": 9.21619972042291, "grad_norm": 2.0576798915863037, "learning_rate": 3.241323905537239e-07, "loss": 1.9921, "num_input_tokens_seen": 476625112, "step": 468100 }, { "epoch": 9.218168573172413, "grad_norm": 2.1071770191192627, "learning_rate": 3.2393940005674625e-07, "loss": 1.9985, "num_input_tokens_seen": 476725296, "step": 468200 }, { "epoch": 9.220137425921916, "grad_norm": 2.01064395904541, "learning_rate": 3.237464394973889e-07, "loss": 1.9309, "num_input_tokens_seen": 476827104, "step": 468300 }, { "epoch": 9.222106278671419, "grad_norm": 2.1058905124664307, "learning_rate": 3.2355350890846266e-07, "loss": 1.9663, "num_input_tokens_seen": 476929072, "step": 468400 }, { "epoch": 9.224075131420921, "grad_norm": 2.111330032348633, "learning_rate": 3.2336060832277423e-07, "loss": 1.9992, "num_input_tokens_seen": 477031472, "step": 468500 }, { "epoch": 9.226043984170424, "grad_norm": 1.7522377967834473, "learning_rate": 3.2316773777312445e-07, "loss": 1.9978, "num_input_tokens_seen": 477132880, "step": 468600 }, { "epoch": 9.228012836919927, "grad_norm": 1.8095093965530396, "learning_rate": 3.2297489729230944e-07, "loss": 1.9728, "num_input_tokens_seen": 477235280, "step": 468700 }, { "epoch": 9.22998168966943, "grad_norm": 1.850618600845337, "learning_rate": 3.2278208691311987e-07, "loss": 2.003, "num_input_tokens_seen": 477337232, "step": 468800 }, { "epoch": 9.231950542418932, "grad_norm": 2.0030128955841064, "learning_rate": 3.225893066683417e-07, "loss": 2.0079, "num_input_tokens_seen": 477438184, "step": 468900 }, { "epoch": 9.233919395168435, "grad_norm": 2.2313177585601807, "learning_rate": 3.223965565907558e-07, "loss": 1.9683, "num_input_tokens_seen": 477540120, "step": 469000 }, { "epoch": 9.235888247917938, "grad_norm": 2.0350351333618164, "learning_rate": 3.222038367131373e-07, "loss": 1.9713, "num_input_tokens_seen": 477641864, "step": 469100 }, { "epoch": 9.23785710066744, "grad_norm": 2.18227219581604, "learning_rate": 3.2201114706825687e-07, "loss": 2.0003, "num_input_tokens_seen": 477743704, "step": 469200 }, { "epoch": 9.239825953416943, "grad_norm": 1.923969030380249, "learning_rate": 3.2181848768887953e-07, "loss": 1.9419, "num_input_tokens_seen": 477846104, "step": 469300 }, { "epoch": 9.241794806166446, "grad_norm": 2.0169677734375, "learning_rate": 3.216258586077657e-07, "loss": 2.0125, "num_input_tokens_seen": 477947016, "step": 469400 }, { "epoch": 9.243763658915949, "grad_norm": 1.9119294881820679, "learning_rate": 3.2143325985766994e-07, "loss": 2.0128, "num_input_tokens_seen": 478048688, "step": 469500 }, { "epoch": 9.245732511665453, "grad_norm": 1.8002443313598633, "learning_rate": 3.2124069147134234e-07, "loss": 1.9549, "num_input_tokens_seen": 478150456, "step": 469600 }, { "epoch": 9.247701364414956, "grad_norm": 2.031088352203369, "learning_rate": 3.2104815348152743e-07, "loss": 2.0572, "num_input_tokens_seen": 478251496, "step": 469700 }, { "epoch": 9.249670217164459, "grad_norm": 1.9940271377563477, "learning_rate": 3.208556459209647e-07, "loss": 1.9898, "num_input_tokens_seen": 478353896, "step": 469800 }, { "epoch": 9.251639069913962, "grad_norm": 1.775673508644104, "learning_rate": 3.2066316882238833e-07, "loss": 1.9776, "num_input_tokens_seen": 478455536, "step": 469900 }, { "epoch": 9.253607922663464, "grad_norm": 1.9610161781311035, "learning_rate": 3.2047072221852734e-07, "loss": 1.9509, "num_input_tokens_seen": 478557312, "step": 470000 }, { "epoch": 9.255576775412967, "grad_norm": 2.22320818901062, "learning_rate": 3.2027830614210614e-07, "loss": 1.9946, "num_input_tokens_seen": 478659128, "step": 470100 }, { "epoch": 9.25754562816247, "grad_norm": 1.7737853527069092, "learning_rate": 3.2008592062584287e-07, "loss": 2.0302, "num_input_tokens_seen": 478761528, "step": 470200 }, { "epoch": 9.259514480911973, "grad_norm": 1.810951590538025, "learning_rate": 3.198935657024515e-07, "loss": 2.0035, "num_input_tokens_seen": 478863928, "step": 470300 }, { "epoch": 9.261483333661475, "grad_norm": 1.684280514717102, "learning_rate": 3.1970124140464017e-07, "loss": 1.9601, "num_input_tokens_seen": 478966328, "step": 470400 }, { "epoch": 9.263452186410978, "grad_norm": 1.8167426586151123, "learning_rate": 3.195089477651121e-07, "loss": 1.9842, "num_input_tokens_seen": 479068728, "step": 470500 }, { "epoch": 9.265421039160481, "grad_norm": 1.8957006931304932, "learning_rate": 3.19316684816565e-07, "loss": 1.9632, "num_input_tokens_seen": 479171032, "step": 470600 }, { "epoch": 9.267389891909984, "grad_norm": 1.9806638956069946, "learning_rate": 3.191244525916918e-07, "loss": 1.9479, "num_input_tokens_seen": 479273432, "step": 470700 }, { "epoch": 9.269358744659487, "grad_norm": 1.9928994178771973, "learning_rate": 3.1893225112317974e-07, "loss": 1.9992, "num_input_tokens_seen": 479374640, "step": 470800 }, { "epoch": 9.27132759740899, "grad_norm": 1.9113658666610718, "learning_rate": 3.187400804437113e-07, "loss": 1.9912, "num_input_tokens_seen": 479475344, "step": 470900 }, { "epoch": 9.273296450158492, "grad_norm": 1.908582091331482, "learning_rate": 3.1854794058596325e-07, "loss": 1.9643, "num_input_tokens_seen": 479577744, "step": 471000 }, { "epoch": 9.275265302907995, "grad_norm": 2.080770969390869, "learning_rate": 3.183558315826075e-07, "loss": 2.017, "num_input_tokens_seen": 479678816, "step": 471100 }, { "epoch": 9.277234155657498, "grad_norm": 1.9410804510116577, "learning_rate": 3.1816375346631064e-07, "loss": 1.954, "num_input_tokens_seen": 479781216, "step": 471200 }, { "epoch": 9.279203008407002, "grad_norm": 2.1430890560150146, "learning_rate": 3.179717062697337e-07, "loss": 1.9689, "num_input_tokens_seen": 479881016, "step": 471300 }, { "epoch": 9.281171861156505, "grad_norm": 1.9866336584091187, "learning_rate": 3.1777969002553287e-07, "loss": 1.9697, "num_input_tokens_seen": 479983416, "step": 471400 }, { "epoch": 9.283140713906008, "grad_norm": 1.7233686447143555, "learning_rate": 3.175877047663588e-07, "loss": 1.9627, "num_input_tokens_seen": 480085136, "step": 471500 }, { "epoch": 9.28510956665551, "grad_norm": 1.7687311172485352, "learning_rate": 3.1739575052485704e-07, "loss": 1.9896, "num_input_tokens_seen": 480187536, "step": 471600 }, { "epoch": 9.287078419405013, "grad_norm": 1.7554092407226562, "learning_rate": 3.172038273336677e-07, "loss": 1.999, "num_input_tokens_seen": 480289248, "step": 471700 }, { "epoch": 9.289047272154516, "grad_norm": 2.0111732482910156, "learning_rate": 3.1701193522542584e-07, "loss": 2.0432, "num_input_tokens_seen": 480390720, "step": 471800 }, { "epoch": 9.291016124904019, "grad_norm": 2.094632625579834, "learning_rate": 3.168200742327609e-07, "loss": 2.0247, "num_input_tokens_seen": 480489912, "step": 471900 }, { "epoch": 9.292984977653521, "grad_norm": 1.6701654195785522, "learning_rate": 3.166282443882972e-07, "loss": 1.9814, "num_input_tokens_seen": 480592312, "step": 472000 }, { "epoch": 9.294953830403024, "grad_norm": 2.229541778564453, "learning_rate": 3.164364457246542e-07, "loss": 1.9446, "num_input_tokens_seen": 480694712, "step": 472100 }, { "epoch": 9.296922683152527, "grad_norm": 1.8854907751083374, "learning_rate": 3.1624467827444514e-07, "loss": 1.9776, "num_input_tokens_seen": 480795456, "step": 472200 }, { "epoch": 9.29889153590203, "grad_norm": 1.9025156497955322, "learning_rate": 3.1605294207027896e-07, "loss": 1.9725, "num_input_tokens_seen": 480897112, "step": 472300 }, { "epoch": 9.300860388651532, "grad_norm": 1.6696339845657349, "learning_rate": 3.158612371447581e-07, "loss": 1.9986, "num_input_tokens_seen": 480999512, "step": 472400 }, { "epoch": 9.302829241401035, "grad_norm": 1.939001202583313, "learning_rate": 3.1566956353048114e-07, "loss": 1.9936, "num_input_tokens_seen": 481100288, "step": 472500 }, { "epoch": 9.304798094150538, "grad_norm": 5.727812767028809, "learning_rate": 3.154779212600399e-07, "loss": 2.0239, "num_input_tokens_seen": 481202688, "step": 472600 }, { "epoch": 9.30676694690004, "grad_norm": 1.8076066970825195, "learning_rate": 3.152863103660221e-07, "loss": 1.908, "num_input_tokens_seen": 481305088, "step": 472700 }, { "epoch": 9.308735799649543, "grad_norm": 2.0467591285705566, "learning_rate": 3.15094730881009e-07, "loss": 1.9849, "num_input_tokens_seen": 481405152, "step": 472800 }, { "epoch": 9.310704652399046, "grad_norm": 2.243654489517212, "learning_rate": 3.1490318283757757e-07, "loss": 1.9627, "num_input_tokens_seen": 481507552, "step": 472900 }, { "epoch": 9.31267350514855, "grad_norm": 1.7883583307266235, "learning_rate": 3.147116662682985e-07, "loss": 2.0307, "num_input_tokens_seen": 481609952, "step": 473000 }, { "epoch": 9.314642357898054, "grad_norm": 1.8222451210021973, "learning_rate": 3.1452018120573785e-07, "loss": 1.9673, "num_input_tokens_seen": 481711112, "step": 473100 }, { "epoch": 9.316611210647556, "grad_norm": 2.2083327770233154, "learning_rate": 3.143287276824561e-07, "loss": 1.9647, "num_input_tokens_seen": 481813512, "step": 473200 }, { "epoch": 9.318580063397059, "grad_norm": 1.7571516036987305, "learning_rate": 3.1413730573100807e-07, "loss": 2.039, "num_input_tokens_seen": 481914184, "step": 473300 }, { "epoch": 9.320548916146562, "grad_norm": 1.7453532218933105, "learning_rate": 3.1394591538394366e-07, "loss": 1.9752, "num_input_tokens_seen": 482016584, "step": 473400 }, { "epoch": 9.322517768896065, "grad_norm": 2.0322978496551514, "learning_rate": 3.1375455667380695e-07, "loss": 1.9844, "num_input_tokens_seen": 482118984, "step": 473500 }, { "epoch": 9.324486621645567, "grad_norm": 2.997708797454834, "learning_rate": 3.135632296331372e-07, "loss": 1.9718, "num_input_tokens_seen": 482221384, "step": 473600 }, { "epoch": 9.32645547439507, "grad_norm": 2.3402109146118164, "learning_rate": 3.133719342944676e-07, "loss": 2.0285, "num_input_tokens_seen": 482323344, "step": 473700 }, { "epoch": 9.328424327144573, "grad_norm": 2.0022799968719482, "learning_rate": 3.131806706903266e-07, "loss": 1.9986, "num_input_tokens_seen": 482424064, "step": 473800 }, { "epoch": 9.330393179894076, "grad_norm": 1.9793131351470947, "learning_rate": 3.129894388532367e-07, "loss": 1.9741, "num_input_tokens_seen": 482525984, "step": 473900 }, { "epoch": 9.332362032643578, "grad_norm": 1.8297178745269775, "learning_rate": 3.1279823881571545e-07, "loss": 1.989, "num_input_tokens_seen": 482627552, "step": 474000 }, { "epoch": 9.334330885393081, "grad_norm": 1.928903579711914, "learning_rate": 3.1260707061027485e-07, "loss": 1.9663, "num_input_tokens_seen": 482729952, "step": 474100 }, { "epoch": 9.336299738142584, "grad_norm": 1.9665343761444092, "learning_rate": 3.1241593426942123e-07, "loss": 1.9881, "num_input_tokens_seen": 482831120, "step": 474200 }, { "epoch": 9.338268590892087, "grad_norm": 1.8264588117599487, "learning_rate": 3.12224829825656e-07, "loss": 2.0132, "num_input_tokens_seen": 482933520, "step": 474300 }, { "epoch": 9.34023744364159, "grad_norm": 1.992934226989746, "learning_rate": 3.1203375731147455e-07, "loss": 1.9447, "num_input_tokens_seen": 483035288, "step": 474400 }, { "epoch": 9.342206296391092, "grad_norm": 1.948421597480774, "learning_rate": 3.1184271675936734e-07, "loss": 1.9601, "num_input_tokens_seen": 483136136, "step": 474500 }, { "epoch": 9.344175149140597, "grad_norm": 2.632755756378174, "learning_rate": 3.1165170820181906e-07, "loss": 1.9267, "num_input_tokens_seen": 483238536, "step": 474600 }, { "epoch": 9.3461440018901, "grad_norm": 1.4781384468078613, "learning_rate": 3.1146073167130936e-07, "loss": 2.0041, "num_input_tokens_seen": 483340120, "step": 474700 }, { "epoch": 9.348112854639602, "grad_norm": 1.8639442920684814, "learning_rate": 3.1126978720031194e-07, "loss": 2.0101, "num_input_tokens_seen": 483442240, "step": 474800 }, { "epoch": 9.350081707389105, "grad_norm": 1.8763703107833862, "learning_rate": 3.1107887482129537e-07, "loss": 2.0322, "num_input_tokens_seen": 483544640, "step": 474900 }, { "epoch": 9.352050560138608, "grad_norm": 1.7841733694076538, "learning_rate": 3.1088799456672267e-07, "loss": 1.9665, "num_input_tokens_seen": 483645528, "step": 475000 }, { "epoch": 9.35401941288811, "grad_norm": 1.8835313320159912, "learning_rate": 3.106971464690514e-07, "loss": 1.9719, "num_input_tokens_seen": 483747928, "step": 475100 }, { "epoch": 9.355988265637613, "grad_norm": 1.7878150939941406, "learning_rate": 3.105063305607339e-07, "loss": 1.9826, "num_input_tokens_seen": 483850328, "step": 475200 }, { "epoch": 9.357957118387116, "grad_norm": 1.845474362373352, "learning_rate": 3.1031554687421644e-07, "loss": 2.0131, "num_input_tokens_seen": 483951776, "step": 475300 }, { "epoch": 9.359925971136619, "grad_norm": 1.8708844184875488, "learning_rate": 3.101247954419406e-07, "loss": 2.0197, "num_input_tokens_seen": 484053648, "step": 475400 }, { "epoch": 9.361894823886121, "grad_norm": 1.9564247131347656, "learning_rate": 3.099340762963415e-07, "loss": 1.9762, "num_input_tokens_seen": 484154704, "step": 475500 }, { "epoch": 9.363863676635624, "grad_norm": 1.9573136568069458, "learning_rate": 3.0974338946985e-07, "loss": 1.9717, "num_input_tokens_seen": 484256280, "step": 475600 }, { "epoch": 9.365832529385127, "grad_norm": 2.103881597518921, "learning_rate": 3.095527349948901e-07, "loss": 1.9731, "num_input_tokens_seen": 484358008, "step": 475700 }, { "epoch": 9.36780138213463, "grad_norm": 1.5944429636001587, "learning_rate": 3.093621129038816e-07, "loss": 1.9819, "num_input_tokens_seen": 484458936, "step": 475800 }, { "epoch": 9.369770234884133, "grad_norm": 2.0732192993164062, "learning_rate": 3.091715232292376e-07, "loss": 1.9712, "num_input_tokens_seen": 484557976, "step": 475900 }, { "epoch": 9.371739087633635, "grad_norm": 1.8265455961227417, "learning_rate": 3.0898096600336676e-07, "loss": 2.0142, "num_input_tokens_seen": 484660376, "step": 476000 }, { "epoch": 9.373707940383138, "grad_norm": 1.9798468351364136, "learning_rate": 3.0879044125867133e-07, "loss": 1.949, "num_input_tokens_seen": 484762776, "step": 476100 }, { "epoch": 9.37567679313264, "grad_norm": 1.9368246793746948, "learning_rate": 3.085999490275487e-07, "loss": 1.9707, "num_input_tokens_seen": 484865176, "step": 476200 }, { "epoch": 9.377645645882145, "grad_norm": 2.264223098754883, "learning_rate": 3.0840948934239046e-07, "loss": 1.9919, "num_input_tokens_seen": 484967576, "step": 476300 }, { "epoch": 9.379614498631648, "grad_norm": 2.0374276638031006, "learning_rate": 3.0821906223558255e-07, "loss": 1.9463, "num_input_tokens_seen": 485069312, "step": 476400 }, { "epoch": 9.38158335138115, "grad_norm": 6.350859642028809, "learning_rate": 3.0802866773950563e-07, "loss": 1.9481, "num_input_tokens_seen": 485171712, "step": 476500 }, { "epoch": 9.383552204130654, "grad_norm": 2.699000835418701, "learning_rate": 3.0783830588653446e-07, "loss": 2.0063, "num_input_tokens_seen": 485274112, "step": 476600 }, { "epoch": 9.385521056880156, "grad_norm": 1.6473956108093262, "learning_rate": 3.076479767090388e-07, "loss": 1.9552, "num_input_tokens_seen": 485376512, "step": 476700 }, { "epoch": 9.38748990962966, "grad_norm": 1.934122085571289, "learning_rate": 3.074576802393822e-07, "loss": 1.964, "num_input_tokens_seen": 485478288, "step": 476800 }, { "epoch": 9.389458762379162, "grad_norm": 1.996249794960022, "learning_rate": 3.0726741650992317e-07, "loss": 1.9986, "num_input_tokens_seen": 485579840, "step": 476900 }, { "epoch": 9.391427615128665, "grad_norm": 1.9151747226715088, "learning_rate": 3.0707718555301434e-07, "loss": 1.9481, "num_input_tokens_seen": 485682240, "step": 477000 }, { "epoch": 9.393396467878167, "grad_norm": 1.7812050580978394, "learning_rate": 3.068869874010028e-07, "loss": 1.967, "num_input_tokens_seen": 485784424, "step": 477100 }, { "epoch": 9.39536532062767, "grad_norm": 1.9762742519378662, "learning_rate": 3.0669682208623047e-07, "loss": 1.9881, "num_input_tokens_seen": 485885896, "step": 477200 }, { "epoch": 9.397334173377173, "grad_norm": 1.9740040302276611, "learning_rate": 3.06506689641033e-07, "loss": 1.9643, "num_input_tokens_seen": 485987680, "step": 477300 }, { "epoch": 9.399303026126676, "grad_norm": 5.866965293884277, "learning_rate": 3.0631659009774106e-07, "loss": 2.0009, "num_input_tokens_seen": 486089264, "step": 477400 }, { "epoch": 9.401271878876178, "grad_norm": 1.9798721075057983, "learning_rate": 3.061265234886793e-07, "loss": 1.9808, "num_input_tokens_seen": 486191664, "step": 477500 }, { "epoch": 9.403240731625681, "grad_norm": 1.6924023628234863, "learning_rate": 3.059364898461671e-07, "loss": 1.9908, "num_input_tokens_seen": 486293832, "step": 477600 }, { "epoch": 9.405209584375184, "grad_norm": 1.7568439245224, "learning_rate": 3.0574648920251787e-07, "loss": 1.9616, "num_input_tokens_seen": 486395472, "step": 477700 }, { "epoch": 9.407178437124687, "grad_norm": 1.8347095251083374, "learning_rate": 3.055565215900399e-07, "loss": 1.9431, "num_input_tokens_seen": 486497872, "step": 477800 }, { "epoch": 9.40914728987419, "grad_norm": 1.718002438545227, "learning_rate": 3.053665870410353e-07, "loss": 2.0107, "num_input_tokens_seen": 486600272, "step": 477900 }, { "epoch": 9.411116142623694, "grad_norm": 2.002347230911255, "learning_rate": 3.051766855878011e-07, "loss": 1.9571, "num_input_tokens_seen": 486702672, "step": 478000 }, { "epoch": 9.413084995373197, "grad_norm": 1.8163652420043945, "learning_rate": 3.049868172626281e-07, "loss": 1.9536, "num_input_tokens_seen": 486804256, "step": 478100 }, { "epoch": 9.4150538481227, "grad_norm": 2.0375304222106934, "learning_rate": 3.047969820978019e-07, "loss": 1.9842, "num_input_tokens_seen": 486906656, "step": 478200 }, { "epoch": 9.417022700872202, "grad_norm": 2.047271966934204, "learning_rate": 3.046071801256029e-07, "loss": 2.025, "num_input_tokens_seen": 487008176, "step": 478300 }, { "epoch": 9.418991553621705, "grad_norm": 2.1189892292022705, "learning_rate": 3.044174113783045e-07, "loss": 1.9808, "num_input_tokens_seen": 487109512, "step": 478400 }, { "epoch": 9.420960406371208, "grad_norm": 2.0342628955841064, "learning_rate": 3.04227675888176e-07, "loss": 1.9879, "num_input_tokens_seen": 487211912, "step": 478500 }, { "epoch": 9.42292925912071, "grad_norm": 1.9001010656356812, "learning_rate": 3.0403797368747975e-07, "loss": 1.9978, "num_input_tokens_seen": 487314312, "step": 478600 }, { "epoch": 9.424898111870213, "grad_norm": 2.133366107940674, "learning_rate": 3.038483048084734e-07, "loss": 1.9807, "num_input_tokens_seen": 487416280, "step": 478700 }, { "epoch": 9.426866964619716, "grad_norm": 1.8415337800979614, "learning_rate": 3.0365866928340833e-07, "loss": 1.9728, "num_input_tokens_seen": 487517608, "step": 478800 }, { "epoch": 9.428835817369219, "grad_norm": 1.873547911643982, "learning_rate": 3.0346906714453056e-07, "loss": 2.0177, "num_input_tokens_seen": 487619256, "step": 478900 }, { "epoch": 9.430804670118722, "grad_norm": 1.9932811260223389, "learning_rate": 3.032794984240803e-07, "loss": 1.9777, "num_input_tokens_seen": 487721656, "step": 479000 }, { "epoch": 9.432773522868224, "grad_norm": 2.0202314853668213, "learning_rate": 3.0308996315429205e-07, "loss": 1.9758, "num_input_tokens_seen": 487824056, "step": 479100 }, { "epoch": 9.434742375617727, "grad_norm": 1.7601984739303589, "learning_rate": 3.029004613673949e-07, "loss": 1.9564, "num_input_tokens_seen": 487925704, "step": 479200 }, { "epoch": 9.43671122836723, "grad_norm": 1.6400821208953857, "learning_rate": 3.0271099309561164e-07, "loss": 1.9816, "num_input_tokens_seen": 488028104, "step": 479300 }, { "epoch": 9.438680081116733, "grad_norm": 2.037163257598877, "learning_rate": 3.0252155837116023e-07, "loss": 1.9441, "num_input_tokens_seen": 488129640, "step": 479400 }, { "epoch": 9.440648933866235, "grad_norm": 1.7901649475097656, "learning_rate": 3.023321572262521e-07, "loss": 1.9221, "num_input_tokens_seen": 488232040, "step": 479500 }, { "epoch": 9.442617786615738, "grad_norm": 1.7820320129394531, "learning_rate": 3.021427896930935e-07, "loss": 1.9781, "num_input_tokens_seen": 488334256, "step": 479600 }, { "epoch": 9.444586639365243, "grad_norm": 1.6406290531158447, "learning_rate": 3.019534558038846e-07, "loss": 2.0058, "num_input_tokens_seen": 488436656, "step": 479700 }, { "epoch": 9.446555492114745, "grad_norm": 1.8044099807739258, "learning_rate": 3.0176415559082026e-07, "loss": 1.9889, "num_input_tokens_seen": 488539056, "step": 479800 }, { "epoch": 9.448524344864248, "grad_norm": 1.8847109079360962, "learning_rate": 3.0157488908608917e-07, "loss": 2.0041, "num_input_tokens_seen": 488640640, "step": 479900 }, { "epoch": 9.450493197613751, "grad_norm": 3.0067076683044434, "learning_rate": 3.0138565632187463e-07, "loss": 2.0161, "num_input_tokens_seen": 488741480, "step": 480000 }, { "epoch": 9.452462050363254, "grad_norm": 2.0110690593719482, "learning_rate": 3.0119645733035406e-07, "loss": 1.959, "num_input_tokens_seen": 488843120, "step": 480100 }, { "epoch": 9.454430903112756, "grad_norm": 1.813224196434021, "learning_rate": 3.01007292143699e-07, "loss": 1.9811, "num_input_tokens_seen": 488944896, "step": 480200 }, { "epoch": 9.45639975586226, "grad_norm": 1.855810523033142, "learning_rate": 3.008181607940757e-07, "loss": 1.9601, "num_input_tokens_seen": 489047064, "step": 480300 }, { "epoch": 9.458368608611762, "grad_norm": 2.0644779205322266, "learning_rate": 3.00629063313644e-07, "loss": 2.0049, "num_input_tokens_seen": 489148944, "step": 480400 }, { "epoch": 9.460337461361265, "grad_norm": 2.047935962677002, "learning_rate": 3.004399997345588e-07, "loss": 1.975, "num_input_tokens_seen": 489251344, "step": 480500 }, { "epoch": 9.462306314110768, "grad_norm": 1.9466698169708252, "learning_rate": 3.0025097008896814e-07, "loss": 1.9407, "num_input_tokens_seen": 489353744, "step": 480600 }, { "epoch": 9.46427516686027, "grad_norm": 1.7519962787628174, "learning_rate": 3.000619744090156e-07, "loss": 2.0201, "num_input_tokens_seen": 489453912, "step": 480700 }, { "epoch": 9.466244019609773, "grad_norm": 2.454188108444214, "learning_rate": 2.998730127268376e-07, "loss": 1.9823, "num_input_tokens_seen": 489554936, "step": 480800 }, { "epoch": 9.468212872359276, "grad_norm": 1.8989869356155396, "learning_rate": 2.996840850745662e-07, "loss": 1.9776, "num_input_tokens_seen": 489656792, "step": 480900 }, { "epoch": 9.470181725108779, "grad_norm": 1.9428150653839111, "learning_rate": 2.994951914843262e-07, "loss": 1.9719, "num_input_tokens_seen": 489759192, "step": 481000 }, { "epoch": 9.472150577858281, "grad_norm": 2.1792547702789307, "learning_rate": 2.993063319882382e-07, "loss": 2.0045, "num_input_tokens_seen": 489860464, "step": 481100 }, { "epoch": 9.474119430607784, "grad_norm": 1.7961978912353516, "learning_rate": 2.9911750661841526e-07, "loss": 1.9754, "num_input_tokens_seen": 489962864, "step": 481200 }, { "epoch": 9.476088283357289, "grad_norm": 1.9791311025619507, "learning_rate": 2.989287154069662e-07, "loss": 1.9865, "num_input_tokens_seen": 490065056, "step": 481300 }, { "epoch": 9.478057136106791, "grad_norm": 1.8737519979476929, "learning_rate": 2.987399583859932e-07, "loss": 1.9861, "num_input_tokens_seen": 490167456, "step": 481400 }, { "epoch": 9.480025988856294, "grad_norm": 2.049654483795166, "learning_rate": 2.9855123558759275e-07, "loss": 1.9892, "num_input_tokens_seen": 490269040, "step": 481500 }, { "epoch": 9.481994841605797, "grad_norm": 2.0935235023498535, "learning_rate": 2.9836254704385567e-07, "loss": 1.9572, "num_input_tokens_seen": 490371440, "step": 481600 }, { "epoch": 9.4839636943553, "grad_norm": 1.6533153057098389, "learning_rate": 2.981738927868667e-07, "loss": 1.9963, "num_input_tokens_seen": 490473840, "step": 481700 }, { "epoch": 9.485932547104802, "grad_norm": 1.9395042657852173, "learning_rate": 2.9798527284870503e-07, "loss": 2.0024, "num_input_tokens_seen": 490573944, "step": 481800 }, { "epoch": 9.487901399854305, "grad_norm": 2.0553762912750244, "learning_rate": 2.977966872614437e-07, "loss": 2.039, "num_input_tokens_seen": 490674256, "step": 481900 }, { "epoch": 9.489870252603808, "grad_norm": 2.0024337768554688, "learning_rate": 2.976081360571504e-07, "loss": 1.9972, "num_input_tokens_seen": 490776656, "step": 482000 }, { "epoch": 9.49183910535331, "grad_norm": 1.929008960723877, "learning_rate": 2.9741961926788637e-07, "loss": 2.0485, "num_input_tokens_seen": 490878264, "step": 482100 }, { "epoch": 9.493807958102813, "grad_norm": 1.934262752532959, "learning_rate": 2.9723113692570755e-07, "loss": 1.9917, "num_input_tokens_seen": 490980664, "step": 482200 }, { "epoch": 9.495776810852316, "grad_norm": 1.8070461750030518, "learning_rate": 2.970426890626637e-07, "loss": 1.9525, "num_input_tokens_seen": 491083064, "step": 482300 }, { "epoch": 9.497745663601819, "grad_norm": 1.7284494638442993, "learning_rate": 2.9685427571079856e-07, "loss": 2.036, "num_input_tokens_seen": 491183808, "step": 482400 }, { "epoch": 9.499714516351322, "grad_norm": 1.7930779457092285, "learning_rate": 2.9666589690215066e-07, "loss": 1.969, "num_input_tokens_seen": 491286208, "step": 482500 }, { "epoch": 9.501683369100824, "grad_norm": 1.8933864831924438, "learning_rate": 2.9647755266875187e-07, "loss": 2.0328, "num_input_tokens_seen": 491387224, "step": 482600 }, { "epoch": 9.503652221850327, "grad_norm": 1.849589228630066, "learning_rate": 2.962892430426287e-07, "loss": 2.0247, "num_input_tokens_seen": 491488816, "step": 482700 }, { "epoch": 9.50562107459983, "grad_norm": 2.3360965251922607, "learning_rate": 2.9610096805580155e-07, "loss": 1.9656, "num_input_tokens_seen": 491590696, "step": 482800 }, { "epoch": 9.507589927349333, "grad_norm": 1.8918428421020508, "learning_rate": 2.9591272774028504e-07, "loss": 1.9105, "num_input_tokens_seen": 491693096, "step": 482900 }, { "epoch": 9.509558780098835, "grad_norm": 2.1820144653320312, "learning_rate": 2.9572452212808774e-07, "loss": 1.9887, "num_input_tokens_seen": 491794912, "step": 483000 }, { "epoch": 9.51152763284834, "grad_norm": 1.9347286224365234, "learning_rate": 2.9553635125121267e-07, "loss": 1.9926, "num_input_tokens_seen": 491897312, "step": 483100 }, { "epoch": 9.513496485597843, "grad_norm": 2.362489700317383, "learning_rate": 2.953482151416564e-07, "loss": 2.0144, "num_input_tokens_seen": 491999712, "step": 483200 }, { "epoch": 9.515465338347346, "grad_norm": 1.9346020221710205, "learning_rate": 2.9516011383140994e-07, "loss": 1.9935, "num_input_tokens_seen": 492102112, "step": 483300 }, { "epoch": 9.517434191096848, "grad_norm": 1.8565385341644287, "learning_rate": 2.9497204735245874e-07, "loss": 1.9682, "num_input_tokens_seen": 492204512, "step": 483400 }, { "epoch": 9.519403043846351, "grad_norm": 1.8313584327697754, "learning_rate": 2.9478401573678124e-07, "loss": 1.9826, "num_input_tokens_seen": 492306088, "step": 483500 }, { "epoch": 9.521371896595854, "grad_norm": 1.8167219161987305, "learning_rate": 2.9459601901635135e-07, "loss": 1.9787, "num_input_tokens_seen": 492408488, "step": 483600 }, { "epoch": 9.523340749345357, "grad_norm": 2.275207281112671, "learning_rate": 2.944080572231357e-07, "loss": 2.0177, "num_input_tokens_seen": 492510224, "step": 483700 }, { "epoch": 9.52530960209486, "grad_norm": 2.0315277576446533, "learning_rate": 2.942201303890961e-07, "loss": 1.9909, "num_input_tokens_seen": 492612624, "step": 483800 }, { "epoch": 9.527278454844362, "grad_norm": 1.9449855089187622, "learning_rate": 2.940322385461874e-07, "loss": 1.9944, "num_input_tokens_seen": 492714232, "step": 483900 }, { "epoch": 9.529247307593865, "grad_norm": 1.933132290840149, "learning_rate": 2.938443817263597e-07, "loss": 1.982, "num_input_tokens_seen": 492816632, "step": 484000 }, { "epoch": 9.531216160343368, "grad_norm": 2.044750928878784, "learning_rate": 2.9365655996155573e-07, "loss": 1.9994, "num_input_tokens_seen": 492919024, "step": 484100 }, { "epoch": 9.53318501309287, "grad_norm": 1.9562097787857056, "learning_rate": 2.934687732837135e-07, "loss": 1.9182, "num_input_tokens_seen": 493021424, "step": 484200 }, { "epoch": 9.535153865842373, "grad_norm": 1.870526909828186, "learning_rate": 2.9328102172476447e-07, "loss": 1.9992, "num_input_tokens_seen": 493122904, "step": 484300 }, { "epoch": 9.537122718591876, "grad_norm": 2.134713649749756, "learning_rate": 2.930933053166341e-07, "loss": 2.0263, "num_input_tokens_seen": 493224592, "step": 484400 }, { "epoch": 9.539091571341379, "grad_norm": 2.2230257987976074, "learning_rate": 2.9290562409124207e-07, "loss": 1.9926, "num_input_tokens_seen": 493326136, "step": 484500 }, { "epoch": 9.541060424090883, "grad_norm": 2.0468411445617676, "learning_rate": 2.927179780805019e-07, "loss": 1.975, "num_input_tokens_seen": 493428536, "step": 484600 }, { "epoch": 9.543029276840386, "grad_norm": 2.568345785140991, "learning_rate": 2.925303673163213e-07, "loss": 1.9885, "num_input_tokens_seen": 493530936, "step": 484700 }, { "epoch": 9.544998129589889, "grad_norm": 2.144352912902832, "learning_rate": 2.923427918306018e-07, "loss": 1.9855, "num_input_tokens_seen": 493632416, "step": 484800 }, { "epoch": 9.546966982339391, "grad_norm": 1.7890158891677856, "learning_rate": 2.9215525165523914e-07, "loss": 1.9792, "num_input_tokens_seen": 493734264, "step": 484900 }, { "epoch": 9.548935835088894, "grad_norm": 1.8331222534179688, "learning_rate": 2.919677468221229e-07, "loss": 1.989, "num_input_tokens_seen": 493836040, "step": 485000 }, { "epoch": 9.550904687838397, "grad_norm": 2.1179535388946533, "learning_rate": 2.917802773631367e-07, "loss": 1.9415, "num_input_tokens_seen": 493937768, "step": 485100 }, { "epoch": 9.5528735405879, "grad_norm": 1.8272769451141357, "learning_rate": 2.9159284331015797e-07, "loss": 2.0194, "num_input_tokens_seen": 494040168, "step": 485200 }, { "epoch": 9.554842393337402, "grad_norm": 1.80509352684021, "learning_rate": 2.914054446950586e-07, "loss": 1.9741, "num_input_tokens_seen": 494141928, "step": 485300 }, { "epoch": 9.556811246086905, "grad_norm": 1.732566237449646, "learning_rate": 2.912180815497039e-07, "loss": 2.0084, "num_input_tokens_seen": 494243072, "step": 485400 }, { "epoch": 9.558780098836408, "grad_norm": 1.8130834102630615, "learning_rate": 2.9103075390595347e-07, "loss": 2.0252, "num_input_tokens_seen": 494343600, "step": 485500 }, { "epoch": 9.56074895158591, "grad_norm": 2.0498244762420654, "learning_rate": 2.9084346179566107e-07, "loss": 1.9835, "num_input_tokens_seen": 494446000, "step": 485600 }, { "epoch": 9.562717804335414, "grad_norm": 2.3239004611968994, "learning_rate": 2.906562052506736e-07, "loss": 2.0066, "num_input_tokens_seen": 494548400, "step": 485700 }, { "epoch": 9.564686657084916, "grad_norm": 2.157916784286499, "learning_rate": 2.9046898430283286e-07, "loss": 2.0113, "num_input_tokens_seen": 494650040, "step": 485800 }, { "epoch": 9.566655509834419, "grad_norm": 2.1191813945770264, "learning_rate": 2.9028179898397397e-07, "loss": 1.9713, "num_input_tokens_seen": 494752440, "step": 485900 }, { "epoch": 9.568624362583922, "grad_norm": 1.954754114151001, "learning_rate": 2.9009464932592653e-07, "loss": 2.033, "num_input_tokens_seen": 494854840, "step": 486000 }, { "epoch": 9.570593215333425, "grad_norm": 2.3735392093658447, "learning_rate": 2.8990753536051335e-07, "loss": 2.0372, "num_input_tokens_seen": 494955728, "step": 486100 }, { "epoch": 9.572562068082927, "grad_norm": 1.8865255117416382, "learning_rate": 2.8972045711955186e-07, "loss": 1.9987, "num_input_tokens_seen": 495058128, "step": 486200 }, { "epoch": 9.57453092083243, "grad_norm": 1.7551848888397217, "learning_rate": 2.89533414634853e-07, "loss": 1.9405, "num_input_tokens_seen": 495160024, "step": 486300 }, { "epoch": 9.576499773581935, "grad_norm": 1.8886034488677979, "learning_rate": 2.8934640793822184e-07, "loss": 1.9612, "num_input_tokens_seen": 495262424, "step": 486400 }, { "epoch": 9.578468626331437, "grad_norm": 1.8521831035614014, "learning_rate": 2.891594370614575e-07, "loss": 2.0034, "num_input_tokens_seen": 495364824, "step": 486500 }, { "epoch": 9.58043747908094, "grad_norm": 1.8598312139511108, "learning_rate": 2.8897250203635227e-07, "loss": 2.0004, "num_input_tokens_seen": 495466600, "step": 486600 }, { "epoch": 9.582406331830443, "grad_norm": 1.9452985525131226, "learning_rate": 2.8878560289469323e-07, "loss": 1.9654, "num_input_tokens_seen": 495569000, "step": 486700 }, { "epoch": 9.584375184579946, "grad_norm": 2.159031629562378, "learning_rate": 2.88598739668261e-07, "loss": 1.9751, "num_input_tokens_seen": 495670232, "step": 486800 }, { "epoch": 9.586344037329448, "grad_norm": 1.8398820161819458, "learning_rate": 2.8841191238883024e-07, "loss": 1.9639, "num_input_tokens_seen": 495771984, "step": 486900 }, { "epoch": 9.588312890078951, "grad_norm": 1.9648598432540894, "learning_rate": 2.8822512108816895e-07, "loss": 1.9311, "num_input_tokens_seen": 495874384, "step": 487000 }, { "epoch": 9.590281742828454, "grad_norm": 1.8710896968841553, "learning_rate": 2.880383657980396e-07, "loss": 1.9724, "num_input_tokens_seen": 495976248, "step": 487100 }, { "epoch": 9.592250595577957, "grad_norm": 1.960744023323059, "learning_rate": 2.8785164655019837e-07, "loss": 1.9517, "num_input_tokens_seen": 496078368, "step": 487200 }, { "epoch": 9.59421944832746, "grad_norm": 1.7784029245376587, "learning_rate": 2.876649633763953e-07, "loss": 1.9817, "num_input_tokens_seen": 496180008, "step": 487300 }, { "epoch": 9.596188301076962, "grad_norm": 1.8506587743759155, "learning_rate": 2.874783163083746e-07, "loss": 2.005, "num_input_tokens_seen": 496281200, "step": 487400 }, { "epoch": 9.598157153826465, "grad_norm": 1.8617209196090698, "learning_rate": 2.872917053778732e-07, "loss": 1.9451, "num_input_tokens_seen": 496383600, "step": 487500 }, { "epoch": 9.600126006575968, "grad_norm": 1.8235269784927368, "learning_rate": 2.871051306166237e-07, "loss": 2.006, "num_input_tokens_seen": 496485952, "step": 487600 }, { "epoch": 9.60209485932547, "grad_norm": 2.2079827785491943, "learning_rate": 2.869185920563509e-07, "loss": 2.0255, "num_input_tokens_seen": 496588352, "step": 487700 }, { "epoch": 9.604063712074973, "grad_norm": 2.095198154449463, "learning_rate": 2.867320897287745e-07, "loss": 1.979, "num_input_tokens_seen": 496689888, "step": 487800 }, { "epoch": 9.606032564824476, "grad_norm": 1.7441695928573608, "learning_rate": 2.8654562366560695e-07, "loss": 1.9682, "num_input_tokens_seen": 496792288, "step": 487900 }, { "epoch": 9.60800141757398, "grad_norm": 1.9506421089172363, "learning_rate": 2.863591938985563e-07, "loss": 1.9592, "num_input_tokens_seen": 496893816, "step": 488000 }, { "epoch": 9.609970270323483, "grad_norm": 1.9684830904006958, "learning_rate": 2.8617280045932254e-07, "loss": 1.9533, "num_input_tokens_seen": 496996216, "step": 488100 }, { "epoch": 9.611939123072986, "grad_norm": 1.8145133256912231, "learning_rate": 2.859864433796008e-07, "loss": 1.9905, "num_input_tokens_seen": 497098616, "step": 488200 }, { "epoch": 9.613907975822489, "grad_norm": 1.9941847324371338, "learning_rate": 2.858001226910788e-07, "loss": 1.9753, "num_input_tokens_seen": 497200288, "step": 488300 }, { "epoch": 9.615876828571992, "grad_norm": 1.7515360116958618, "learning_rate": 2.8561383842543953e-07, "loss": 1.9834, "num_input_tokens_seen": 497302144, "step": 488400 }, { "epoch": 9.617845681321494, "grad_norm": 2.0508158206939697, "learning_rate": 2.85427590614359e-07, "loss": 2.0187, "num_input_tokens_seen": 497403072, "step": 488500 }, { "epoch": 9.619814534070997, "grad_norm": 2.0787603855133057, "learning_rate": 2.8524137928950676e-07, "loss": 1.9858, "num_input_tokens_seen": 497505472, "step": 488600 }, { "epoch": 9.6217833868205, "grad_norm": 1.88705575466156, "learning_rate": 2.8505520448254647e-07, "loss": 1.9786, "num_input_tokens_seen": 497607288, "step": 488700 }, { "epoch": 9.623752239570003, "grad_norm": 1.6387436389923096, "learning_rate": 2.848690662251359e-07, "loss": 2.017, "num_input_tokens_seen": 497708920, "step": 488800 }, { "epoch": 9.625721092319505, "grad_norm": 2.3381474018096924, "learning_rate": 2.8468296454892625e-07, "loss": 2.0175, "num_input_tokens_seen": 497811320, "step": 488900 }, { "epoch": 9.627689945069008, "grad_norm": 1.9387003183364868, "learning_rate": 2.8449689948556223e-07, "loss": 1.9561, "num_input_tokens_seen": 497913720, "step": 489000 }, { "epoch": 9.62965879781851, "grad_norm": 3.1934409141540527, "learning_rate": 2.843108710666828e-07, "loss": 1.9508, "num_input_tokens_seen": 498015576, "step": 489100 }, { "epoch": 9.631627650568014, "grad_norm": 2.009428024291992, "learning_rate": 2.841248793239205e-07, "loss": 2.0133, "num_input_tokens_seen": 498117384, "step": 489200 }, { "epoch": 9.633596503317516, "grad_norm": 2.0938000679016113, "learning_rate": 2.8393892428890176e-07, "loss": 1.9905, "num_input_tokens_seen": 498219784, "step": 489300 }, { "epoch": 9.63556535606702, "grad_norm": 1.8964952230453491, "learning_rate": 2.837530059932468e-07, "loss": 1.9921, "num_input_tokens_seen": 498321480, "step": 489400 }, { "epoch": 9.637534208816522, "grad_norm": 2.0135340690612793, "learning_rate": 2.8356712446856903e-07, "loss": 1.9707, "num_input_tokens_seen": 498423232, "step": 489500 }, { "epoch": 9.639503061566025, "grad_norm": 1.9531112909317017, "learning_rate": 2.8338127974647623e-07, "loss": 1.9575, "num_input_tokens_seen": 498525632, "step": 489600 }, { "epoch": 9.641471914315527, "grad_norm": 1.8988596200942993, "learning_rate": 2.8319547185856983e-07, "loss": 1.9986, "num_input_tokens_seen": 498627744, "step": 489700 }, { "epoch": 9.643440767065032, "grad_norm": 2.2402443885803223, "learning_rate": 2.8300970083644505e-07, "loss": 1.9953, "num_input_tokens_seen": 498729304, "step": 489800 }, { "epoch": 9.645409619814535, "grad_norm": 2.510845422744751, "learning_rate": 2.828239667116902e-07, "loss": 1.9903, "num_input_tokens_seen": 498831704, "step": 489900 }, { "epoch": 9.647378472564037, "grad_norm": 1.8515900373458862, "learning_rate": 2.82638269515888e-07, "loss": 1.9601, "num_input_tokens_seen": 498933656, "step": 490000 }, { "epoch": 9.64934732531354, "grad_norm": 1.7519137859344482, "learning_rate": 2.8245260928061485e-07, "loss": 1.9976, "num_input_tokens_seen": 499036056, "step": 490100 }, { "epoch": 9.651316178063043, "grad_norm": 2.0999104976654053, "learning_rate": 2.8226698603744084e-07, "loss": 1.9393, "num_input_tokens_seen": 499137696, "step": 490200 }, { "epoch": 9.653285030812546, "grad_norm": 1.8033839464187622, "learning_rate": 2.820813998179291e-07, "loss": 2.0059, "num_input_tokens_seen": 499240096, "step": 490300 }, { "epoch": 9.655253883562049, "grad_norm": 2.0442311763763428, "learning_rate": 2.818958506536372e-07, "loss": 2.0026, "num_input_tokens_seen": 499342496, "step": 490400 }, { "epoch": 9.657222736311551, "grad_norm": 1.8606821298599243, "learning_rate": 2.817103385761167e-07, "loss": 2.002, "num_input_tokens_seen": 499444128, "step": 490500 }, { "epoch": 9.659191589061054, "grad_norm": 1.8472729921340942, "learning_rate": 2.815248636169118e-07, "loss": 1.9577, "num_input_tokens_seen": 499546528, "step": 490600 }, { "epoch": 9.661160441810557, "grad_norm": 2.1541848182678223, "learning_rate": 2.813394258075613e-07, "loss": 1.9521, "num_input_tokens_seen": 499648928, "step": 490700 }, { "epoch": 9.66312929456006, "grad_norm": 2.046351432800293, "learning_rate": 2.811540251795968e-07, "loss": 1.9455, "num_input_tokens_seen": 499751328, "step": 490800 }, { "epoch": 9.665098147309562, "grad_norm": 1.9514741897583008, "learning_rate": 2.809686617645449e-07, "loss": 1.9988, "num_input_tokens_seen": 499853024, "step": 490900 }, { "epoch": 9.667067000059065, "grad_norm": 1.676711916923523, "learning_rate": 2.8078333559392445e-07, "loss": 2.0047, "num_input_tokens_seen": 499954528, "step": 491000 }, { "epoch": 9.669035852808568, "grad_norm": 1.87375807762146, "learning_rate": 2.805980466992488e-07, "loss": 1.9715, "num_input_tokens_seen": 500056312, "step": 491100 }, { "epoch": 9.67100470555807, "grad_norm": 1.7538889646530151, "learning_rate": 2.804127951120248e-07, "loss": 1.9975, "num_input_tokens_seen": 500158144, "step": 491200 }, { "epoch": 9.672973558307575, "grad_norm": 1.7303611040115356, "learning_rate": 2.802275808637529e-07, "loss": 1.9748, "num_input_tokens_seen": 500260544, "step": 491300 }, { "epoch": 9.674942411057078, "grad_norm": 1.888983130455017, "learning_rate": 2.800424039859275e-07, "loss": 2.0078, "num_input_tokens_seen": 500361696, "step": 491400 }, { "epoch": 9.67691126380658, "grad_norm": 1.9586635828018188, "learning_rate": 2.7985726451003575e-07, "loss": 2.0308, "num_input_tokens_seen": 500462624, "step": 491500 }, { "epoch": 9.678880116556083, "grad_norm": 1.8989180326461792, "learning_rate": 2.796721624675595e-07, "loss": 1.976, "num_input_tokens_seen": 500565024, "step": 491600 }, { "epoch": 9.680848969305586, "grad_norm": 1.726993203163147, "learning_rate": 2.7948709788997373e-07, "loss": 1.9558, "num_input_tokens_seen": 500666680, "step": 491700 }, { "epoch": 9.682817822055089, "grad_norm": 1.9682456254959106, "learning_rate": 2.7930207080874723e-07, "loss": 2.0421, "num_input_tokens_seen": 500769080, "step": 491800 }, { "epoch": 9.684786674804592, "grad_norm": 2.2453274726867676, "learning_rate": 2.791170812553419e-07, "loss": 1.9356, "num_input_tokens_seen": 500871480, "step": 491900 }, { "epoch": 9.686755527554094, "grad_norm": 2.1051816940307617, "learning_rate": 2.789321292612139e-07, "loss": 2.0088, "num_input_tokens_seen": 500972744, "step": 492000 }, { "epoch": 9.688724380303597, "grad_norm": 2.17006516456604, "learning_rate": 2.7874721485781285e-07, "loss": 1.9819, "num_input_tokens_seen": 501074712, "step": 492100 }, { "epoch": 9.6906932330531, "grad_norm": 2.107874870300293, "learning_rate": 2.785623380765818e-07, "loss": 1.9853, "num_input_tokens_seen": 501176480, "step": 492200 }, { "epoch": 9.692662085802603, "grad_norm": 2.049481153488159, "learning_rate": 2.783774989489574e-07, "loss": 1.9698, "num_input_tokens_seen": 501278880, "step": 492300 }, { "epoch": 9.694630938552105, "grad_norm": 1.8123736381530762, "learning_rate": 2.7819269750637003e-07, "loss": 2.0016, "num_input_tokens_seen": 501380888, "step": 492400 }, { "epoch": 9.696599791301608, "grad_norm": 1.734336495399475, "learning_rate": 2.780079337802436e-07, "loss": 1.9658, "num_input_tokens_seen": 501483288, "step": 492500 }, { "epoch": 9.698568644051111, "grad_norm": 2.018087387084961, "learning_rate": 2.7782320780199575e-07, "loss": 1.9907, "num_input_tokens_seen": 501585688, "step": 492600 }, { "epoch": 9.700537496800614, "grad_norm": 2.8398308753967285, "learning_rate": 2.7763851960303766e-07, "loss": 2.0221, "num_input_tokens_seen": 501687240, "step": 492700 }, { "epoch": 9.702506349550116, "grad_norm": 2.0368359088897705, "learning_rate": 2.774538692147733e-07, "loss": 1.9818, "num_input_tokens_seen": 501788744, "step": 492800 }, { "epoch": 9.70447520229962, "grad_norm": 1.855392336845398, "learning_rate": 2.7726925666860196e-07, "loss": 1.9906, "num_input_tokens_seen": 501891144, "step": 492900 }, { "epoch": 9.706444055049122, "grad_norm": 1.8716174364089966, "learning_rate": 2.7708468199591474e-07, "loss": 1.996, "num_input_tokens_seen": 501991904, "step": 493000 }, { "epoch": 9.708412907798627, "grad_norm": 2.46576189994812, "learning_rate": 2.7690014522809734e-07, "loss": 1.9955, "num_input_tokens_seen": 502094304, "step": 493100 }, { "epoch": 9.71038176054813, "grad_norm": 2.07723069190979, "learning_rate": 2.7671564639652814e-07, "loss": 1.9854, "num_input_tokens_seen": 502196704, "step": 493200 }, { "epoch": 9.712350613297632, "grad_norm": 4.248500347137451, "learning_rate": 2.7653118553258034e-07, "loss": 2.0202, "num_input_tokens_seen": 502299104, "step": 493300 }, { "epoch": 9.714319466047135, "grad_norm": 2.7718570232391357, "learning_rate": 2.763467626676196e-07, "loss": 2.0088, "num_input_tokens_seen": 502400896, "step": 493400 }, { "epoch": 9.716288318796638, "grad_norm": 2.031266212463379, "learning_rate": 2.7616237783300534e-07, "loss": 1.9739, "num_input_tokens_seen": 502502600, "step": 493500 }, { "epoch": 9.71825717154614, "grad_norm": 2.1321840286254883, "learning_rate": 2.759780310600908e-07, "loss": 1.934, "num_input_tokens_seen": 502604488, "step": 493600 }, { "epoch": 9.720226024295643, "grad_norm": 1.8724679946899414, "learning_rate": 2.757937223802226e-07, "loss": 1.9673, "num_input_tokens_seen": 502705680, "step": 493700 }, { "epoch": 9.722194877045146, "grad_norm": 1.9844439029693604, "learning_rate": 2.7560945182474114e-07, "loss": 2.0069, "num_input_tokens_seen": 502808032, "step": 493800 }, { "epoch": 9.724163729794649, "grad_norm": 2.011350393295288, "learning_rate": 2.7542521942497956e-07, "loss": 1.955, "num_input_tokens_seen": 502908816, "step": 493900 }, { "epoch": 9.726132582544151, "grad_norm": 2.1120424270629883, "learning_rate": 2.752410252122652e-07, "loss": 1.9777, "num_input_tokens_seen": 503011216, "step": 494000 }, { "epoch": 9.728101435293654, "grad_norm": 2.089890956878662, "learning_rate": 2.750568692179189e-07, "loss": 1.9646, "num_input_tokens_seen": 503113120, "step": 494100 }, { "epoch": 9.730070288043157, "grad_norm": 2.004549503326416, "learning_rate": 2.748727514732549e-07, "loss": 2.0183, "num_input_tokens_seen": 503213472, "step": 494200 }, { "epoch": 9.73203914079266, "grad_norm": 2.12077260017395, "learning_rate": 2.7468867200958045e-07, "loss": 1.9554, "num_input_tokens_seen": 503315872, "step": 494300 }, { "epoch": 9.734007993542162, "grad_norm": 1.6632493734359741, "learning_rate": 2.7450463085819704e-07, "loss": 1.9637, "num_input_tokens_seen": 503416696, "step": 494400 }, { "epoch": 9.735976846291665, "grad_norm": 1.9577703475952148, "learning_rate": 2.743206280503992e-07, "loss": 1.9732, "num_input_tokens_seen": 503519096, "step": 494500 }, { "epoch": 9.737945699041168, "grad_norm": 1.8225675821304321, "learning_rate": 2.7413666361747506e-07, "loss": 1.9506, "num_input_tokens_seen": 503621496, "step": 494600 }, { "epoch": 9.739914551790672, "grad_norm": 2.062499523162842, "learning_rate": 2.7395273759070645e-07, "loss": 1.9915, "num_input_tokens_seen": 503723896, "step": 494700 }, { "epoch": 9.741883404540175, "grad_norm": 2.1235992908477783, "learning_rate": 2.7376885000136807e-07, "loss": 2.0217, "num_input_tokens_seen": 503825136, "step": 494800 }, { "epoch": 9.743852257289678, "grad_norm": 2.125648021697998, "learning_rate": 2.735850008807287e-07, "loss": 1.9994, "num_input_tokens_seen": 503927536, "step": 494900 }, { "epoch": 9.74582111003918, "grad_norm": 1.895817756652832, "learning_rate": 2.7340119026005015e-07, "loss": 2.062, "num_input_tokens_seen": 504028856, "step": 495000 }, { "epoch": 9.747789962788683, "grad_norm": 2.0151543617248535, "learning_rate": 2.732174181705882e-07, "loss": 1.9727, "num_input_tokens_seen": 504131168, "step": 495100 }, { "epoch": 9.749758815538186, "grad_norm": 2.3339152336120605, "learning_rate": 2.7303368464359133e-07, "loss": 1.9456, "num_input_tokens_seen": 504233568, "step": 495200 }, { "epoch": 9.751727668287689, "grad_norm": 1.84795081615448, "learning_rate": 2.7284998971030205e-07, "loss": 1.9769, "num_input_tokens_seen": 504335152, "step": 495300 }, { "epoch": 9.753696521037192, "grad_norm": 1.887708306312561, "learning_rate": 2.7266633340195615e-07, "loss": 1.9694, "num_input_tokens_seen": 504436984, "step": 495400 }, { "epoch": 9.755665373786695, "grad_norm": 1.8877062797546387, "learning_rate": 2.724827157497829e-07, "loss": 1.9582, "num_input_tokens_seen": 504538504, "step": 495500 }, { "epoch": 9.757634226536197, "grad_norm": 1.9336410760879517, "learning_rate": 2.7229913678500495e-07, "loss": 2.0069, "num_input_tokens_seen": 504640904, "step": 495600 }, { "epoch": 9.7596030792857, "grad_norm": 1.885657548904419, "learning_rate": 2.72115596538838e-07, "loss": 1.9436, "num_input_tokens_seen": 504743304, "step": 495700 }, { "epoch": 9.761571932035203, "grad_norm": 1.6186270713806152, "learning_rate": 2.7193209504249205e-07, "loss": 1.9954, "num_input_tokens_seen": 504845016, "step": 495800 }, { "epoch": 9.763540784784706, "grad_norm": 1.859894871711731, "learning_rate": 2.717486323271696e-07, "loss": 1.96, "num_input_tokens_seen": 504946704, "step": 495900 }, { "epoch": 9.765509637534208, "grad_norm": 1.8920289278030396, "learning_rate": 2.715652084240673e-07, "loss": 2.009, "num_input_tokens_seen": 505047872, "step": 496000 }, { "epoch": 9.767478490283711, "grad_norm": 2.127593994140625, "learning_rate": 2.7138182336437407e-07, "loss": 1.961, "num_input_tokens_seen": 505147912, "step": 496100 }, { "epoch": 9.769447343033214, "grad_norm": 1.780908226966858, "learning_rate": 2.711984771792741e-07, "loss": 1.9419, "num_input_tokens_seen": 505250312, "step": 496200 }, { "epoch": 9.771416195782717, "grad_norm": 1.4779224395751953, "learning_rate": 2.71015169899943e-07, "loss": 1.9676, "num_input_tokens_seen": 505351496, "step": 496300 }, { "epoch": 9.77338504853222, "grad_norm": 1.8564016819000244, "learning_rate": 2.7083190155755087e-07, "loss": 2.0155, "num_input_tokens_seen": 505453896, "step": 496400 }, { "epoch": 9.775353901281724, "grad_norm": 1.9791815280914307, "learning_rate": 2.7064867218326104e-07, "loss": 1.9633, "num_input_tokens_seen": 505556296, "step": 496500 }, { "epoch": 9.777322754031227, "grad_norm": 1.8004722595214844, "learning_rate": 2.7046548180823003e-07, "loss": 1.9416, "num_input_tokens_seen": 505658696, "step": 496600 }, { "epoch": 9.77929160678073, "grad_norm": 1.880846381187439, "learning_rate": 2.702823304636081e-07, "loss": 2.0206, "num_input_tokens_seen": 505760160, "step": 496700 }, { "epoch": 9.781260459530232, "grad_norm": 1.8842183351516724, "learning_rate": 2.700992181805382e-07, "loss": 2.0011, "num_input_tokens_seen": 505862048, "step": 496800 }, { "epoch": 9.783229312279735, "grad_norm": 2.29910945892334, "learning_rate": 2.69916144990157e-07, "loss": 1.9832, "num_input_tokens_seen": 505964448, "step": 496900 }, { "epoch": 9.785198165029238, "grad_norm": 1.9658163785934448, "learning_rate": 2.6973311092359485e-07, "loss": 1.9563, "num_input_tokens_seen": 506066032, "step": 497000 }, { "epoch": 9.78716701777874, "grad_norm": 1.6949522495269775, "learning_rate": 2.695501160119752e-07, "loss": 1.9488, "num_input_tokens_seen": 506167392, "step": 497100 }, { "epoch": 9.789135870528243, "grad_norm": 1.6766189336776733, "learning_rate": 2.6936716028641445e-07, "loss": 2.0115, "num_input_tokens_seen": 506269792, "step": 497200 }, { "epoch": 9.791104723277746, "grad_norm": 1.628969669342041, "learning_rate": 2.691842437780228e-07, "loss": 1.9597, "num_input_tokens_seen": 506371384, "step": 497300 }, { "epoch": 9.793073576027249, "grad_norm": 1.7124541997909546, "learning_rate": 2.690013665179037e-07, "loss": 2.0004, "num_input_tokens_seen": 506473040, "step": 497400 }, { "epoch": 9.795042428776751, "grad_norm": 2.026693820953369, "learning_rate": 2.6881852853715394e-07, "loss": 1.9905, "num_input_tokens_seen": 506573416, "step": 497500 }, { "epoch": 9.797011281526254, "grad_norm": 1.9993776082992554, "learning_rate": 2.6863572986686365e-07, "loss": 2.0227, "num_input_tokens_seen": 506675376, "step": 497600 }, { "epoch": 9.798980134275757, "grad_norm": 1.9662845134735107, "learning_rate": 2.68452970538116e-07, "loss": 1.9679, "num_input_tokens_seen": 506776608, "step": 497700 }, { "epoch": 9.80094898702526, "grad_norm": 1.9802907705307007, "learning_rate": 2.6827025058198767e-07, "loss": 1.9893, "num_input_tokens_seen": 506877784, "step": 497800 }, { "epoch": 9.802917839774762, "grad_norm": 1.615738868713379, "learning_rate": 2.6808757002954873e-07, "loss": 1.9751, "num_input_tokens_seen": 506980184, "step": 497900 }, { "epoch": 9.804886692524267, "grad_norm": 1.8563772439956665, "learning_rate": 2.6790492891186267e-07, "loss": 1.9837, "num_input_tokens_seen": 507081840, "step": 498000 }, { "epoch": 9.80685554527377, "grad_norm": 2.0212411880493164, "learning_rate": 2.677223272599857e-07, "loss": 1.9597, "num_input_tokens_seen": 507183472, "step": 498100 }, { "epoch": 9.808824398023273, "grad_norm": 2.157810926437378, "learning_rate": 2.675397651049678e-07, "loss": 1.9883, "num_input_tokens_seen": 507285232, "step": 498200 }, { "epoch": 9.810793250772775, "grad_norm": 1.9870649576187134, "learning_rate": 2.6735724247785224e-07, "loss": 1.9834, "num_input_tokens_seen": 507387632, "step": 498300 }, { "epoch": 9.812762103522278, "grad_norm": 1.8975505828857422, "learning_rate": 2.671747594096756e-07, "loss": 1.9949, "num_input_tokens_seen": 507488704, "step": 498400 }, { "epoch": 9.81473095627178, "grad_norm": 1.7841147184371948, "learning_rate": 2.6699231593146717e-07, "loss": 1.9388, "num_input_tokens_seen": 507590536, "step": 498500 }, { "epoch": 9.816699809021284, "grad_norm": 1.925437092781067, "learning_rate": 2.668099120742499e-07, "loss": 1.9734, "num_input_tokens_seen": 507692360, "step": 498600 }, { "epoch": 9.818668661770786, "grad_norm": 2.075392723083496, "learning_rate": 2.666275478690407e-07, "loss": 2.0219, "num_input_tokens_seen": 507794112, "step": 498700 }, { "epoch": 9.820637514520289, "grad_norm": 1.7077875137329102, "learning_rate": 2.6644522334684844e-07, "loss": 2.0062, "num_input_tokens_seen": 507895664, "step": 498800 }, { "epoch": 9.822606367269792, "grad_norm": 1.8576103448867798, "learning_rate": 2.662629385386762e-07, "loss": 1.963, "num_input_tokens_seen": 507996664, "step": 498900 }, { "epoch": 9.824575220019295, "grad_norm": 1.8298090696334839, "learning_rate": 2.660806934755195e-07, "loss": 1.9652, "num_input_tokens_seen": 508099064, "step": 499000 }, { "epoch": 9.826544072768797, "grad_norm": 2.252462387084961, "learning_rate": 2.658984881883683e-07, "loss": 2.0077, "num_input_tokens_seen": 508200608, "step": 499100 }, { "epoch": 9.8285129255183, "grad_norm": 3.3195366859436035, "learning_rate": 2.6571632270820443e-07, "loss": 1.9899, "num_input_tokens_seen": 508301488, "step": 499200 }, { "epoch": 9.830481778267803, "grad_norm": 1.695478916168213, "learning_rate": 2.655341970660041e-07, "loss": 1.9432, "num_input_tokens_seen": 508403888, "step": 499300 }, { "epoch": 9.832450631017306, "grad_norm": 1.786743402481079, "learning_rate": 2.653521112927356e-07, "loss": 1.9801, "num_input_tokens_seen": 508506288, "step": 499400 }, { "epoch": 9.834419483766808, "grad_norm": 2.0381052494049072, "learning_rate": 2.651700654193616e-07, "loss": 1.9688, "num_input_tokens_seen": 508608688, "step": 499500 }, { "epoch": 9.836388336516311, "grad_norm": 1.5066784620285034, "learning_rate": 2.6498805947683756e-07, "loss": 2.0125, "num_input_tokens_seen": 508711088, "step": 499600 }, { "epoch": 9.838357189265814, "grad_norm": 1.9943665266036987, "learning_rate": 2.648060934961116e-07, "loss": 1.9359, "num_input_tokens_seen": 508813288, "step": 499700 }, { "epoch": 9.840326042015318, "grad_norm": 1.9675488471984863, "learning_rate": 2.6462416750812577e-07, "loss": 2.0064, "num_input_tokens_seen": 508915152, "step": 499800 }, { "epoch": 9.842294894764821, "grad_norm": 2.042598009109497, "learning_rate": 2.6444228154381497e-07, "loss": 1.9775, "num_input_tokens_seen": 509017552, "step": 499900 }, { "epoch": 9.844263747514324, "grad_norm": 2.2934234142303467, "learning_rate": 2.642604356341076e-07, "loss": 1.9749, "num_input_tokens_seen": 509118664, "step": 500000 }, { "epoch": 9.846232600263827, "grad_norm": 2.0251336097717285, "learning_rate": 2.6407862980992467e-07, "loss": 2.0087, "num_input_tokens_seen": 509219320, "step": 500100 }, { "epoch": 9.84820145301333, "grad_norm": 1.9895894527435303, "learning_rate": 2.638968641021808e-07, "loss": 2.0134, "num_input_tokens_seen": 509321720, "step": 500200 }, { "epoch": 9.850170305762832, "grad_norm": 3.142385482788086, "learning_rate": 2.637151385417839e-07, "loss": 1.9761, "num_input_tokens_seen": 509424120, "step": 500300 }, { "epoch": 9.852139158512335, "grad_norm": 1.9831715822219849, "learning_rate": 2.635334531596349e-07, "loss": 1.9859, "num_input_tokens_seen": 509525688, "step": 500400 }, { "epoch": 9.854108011261838, "grad_norm": 1.9075015783309937, "learning_rate": 2.633518079866276e-07, "loss": 1.9598, "num_input_tokens_seen": 509628088, "step": 500500 }, { "epoch": 9.85607686401134, "grad_norm": 1.8580267429351807, "learning_rate": 2.6317020305364936e-07, "loss": 2.0112, "num_input_tokens_seen": 509728448, "step": 500600 }, { "epoch": 9.858045716760843, "grad_norm": 1.93879234790802, "learning_rate": 2.6298863839158056e-07, "loss": 1.9739, "num_input_tokens_seen": 509830848, "step": 500700 }, { "epoch": 9.860014569510346, "grad_norm": 2.9762351512908936, "learning_rate": 2.6280711403129477e-07, "loss": 1.9801, "num_input_tokens_seen": 509933248, "step": 500800 }, { "epoch": 9.861983422259849, "grad_norm": 1.956985354423523, "learning_rate": 2.6262563000365886e-07, "loss": 2.0005, "num_input_tokens_seen": 510034840, "step": 500900 }, { "epoch": 9.863952275009352, "grad_norm": 2.363708257675171, "learning_rate": 2.6244418633953226e-07, "loss": 1.9879, "num_input_tokens_seen": 510136696, "step": 501000 }, { "epoch": 9.865921127758854, "grad_norm": 2.0405540466308594, "learning_rate": 2.6226278306976815e-07, "loss": 1.9808, "num_input_tokens_seen": 510238304, "step": 501100 }, { "epoch": 9.867889980508357, "grad_norm": 2.091843366622925, "learning_rate": 2.620814202252125e-07, "loss": 2.0128, "num_input_tokens_seen": 510340704, "step": 501200 }, { "epoch": 9.86985883325786, "grad_norm": 1.728461742401123, "learning_rate": 2.61900097836705e-07, "loss": 2.0451, "num_input_tokens_seen": 510441816, "step": 501300 }, { "epoch": 9.871827686007364, "grad_norm": 1.9842027425765991, "learning_rate": 2.617188159350772e-07, "loss": 2.0257, "num_input_tokens_seen": 510543376, "step": 501400 }, { "epoch": 9.873796538756867, "grad_norm": 1.8193837404251099, "learning_rate": 2.615375745511551e-07, "loss": 1.9786, "num_input_tokens_seen": 510645064, "step": 501500 }, { "epoch": 9.87576539150637, "grad_norm": 1.8437957763671875, "learning_rate": 2.6135637371575743e-07, "loss": 2.0041, "num_input_tokens_seen": 510746392, "step": 501600 }, { "epoch": 9.877734244255873, "grad_norm": 2.14461612701416, "learning_rate": 2.611752134596954e-07, "loss": 1.9877, "num_input_tokens_seen": 510848192, "step": 501700 }, { "epoch": 9.879703097005375, "grad_norm": 1.79839026927948, "learning_rate": 2.609940938137739e-07, "loss": 1.9885, "num_input_tokens_seen": 510950592, "step": 501800 }, { "epoch": 9.881671949754878, "grad_norm": 2.0120837688446045, "learning_rate": 2.60813014808791e-07, "loss": 1.9937, "num_input_tokens_seen": 511052992, "step": 501900 }, { "epoch": 9.883640802504381, "grad_norm": 2.0450146198272705, "learning_rate": 2.6063197647553763e-07, "loss": 2.0191, "num_input_tokens_seen": 511152664, "step": 502000 }, { "epoch": 9.885609655253884, "grad_norm": 2.009990692138672, "learning_rate": 2.604509788447976e-07, "loss": 1.9992, "num_input_tokens_seen": 511253880, "step": 502100 }, { "epoch": 9.887578508003386, "grad_norm": 2.1507298946380615, "learning_rate": 2.6027002194734813e-07, "loss": 2.0121, "num_input_tokens_seen": 511354808, "step": 502200 }, { "epoch": 9.88954736075289, "grad_norm": 1.8799818754196167, "learning_rate": 2.6008910581395946e-07, "loss": 1.9703, "num_input_tokens_seen": 511456360, "step": 502300 }, { "epoch": 9.891516213502392, "grad_norm": 2.1749606132507324, "learning_rate": 2.599082304753951e-07, "loss": 1.9316, "num_input_tokens_seen": 511557536, "step": 502400 }, { "epoch": 9.893485066251895, "grad_norm": 2.0596413612365723, "learning_rate": 2.597273959624109e-07, "loss": 2.0277, "num_input_tokens_seen": 511658456, "step": 502500 }, { "epoch": 9.895453919001397, "grad_norm": 1.997752070426941, "learning_rate": 2.595466023057564e-07, "loss": 2.0189, "num_input_tokens_seen": 511760856, "step": 502600 }, { "epoch": 9.8974227717509, "grad_norm": 8.40396499633789, "learning_rate": 2.593658495361742e-07, "loss": 1.9804, "num_input_tokens_seen": 511861480, "step": 502700 }, { "epoch": 9.899391624500403, "grad_norm": 1.9299832582473755, "learning_rate": 2.591851376843996e-07, "loss": 1.973, "num_input_tokens_seen": 511963880, "step": 502800 }, { "epoch": 9.901360477249906, "grad_norm": 1.8149622678756714, "learning_rate": 2.5900446678116145e-07, "loss": 2.0083, "num_input_tokens_seen": 512065416, "step": 502900 }, { "epoch": 9.903329329999409, "grad_norm": 1.9619429111480713, "learning_rate": 2.588238368571809e-07, "loss": 2.0159, "num_input_tokens_seen": 512166376, "step": 503000 }, { "epoch": 9.905298182748911, "grad_norm": 2.074518918991089, "learning_rate": 2.5864324794317263e-07, "loss": 1.9427, "num_input_tokens_seen": 512268224, "step": 503100 }, { "epoch": 9.907267035498416, "grad_norm": 1.9608818292617798, "learning_rate": 2.584627000698444e-07, "loss": 2.0108, "num_input_tokens_seen": 512370176, "step": 503200 }, { "epoch": 9.909235888247919, "grad_norm": 1.947589635848999, "learning_rate": 2.58282193267897e-07, "loss": 2.0182, "num_input_tokens_seen": 512470304, "step": 503300 }, { "epoch": 9.911204740997421, "grad_norm": 1.9365743398666382, "learning_rate": 2.581017275680237e-07, "loss": 2.0415, "num_input_tokens_seen": 512572704, "step": 503400 }, { "epoch": 9.913173593746924, "grad_norm": 1.9300899505615234, "learning_rate": 2.579213030009114e-07, "loss": 2.0169, "num_input_tokens_seen": 512675104, "step": 503500 }, { "epoch": 9.915142446496427, "grad_norm": 2.0466909408569336, "learning_rate": 2.577409195972397e-07, "loss": 1.995, "num_input_tokens_seen": 512776680, "step": 503600 }, { "epoch": 9.91711129924593, "grad_norm": 1.806604027748108, "learning_rate": 2.5756057738768133e-07, "loss": 2.0216, "num_input_tokens_seen": 512878600, "step": 503700 }, { "epoch": 9.919080151995432, "grad_norm": 1.7431915998458862, "learning_rate": 2.573802764029021e-07, "loss": 2.0301, "num_input_tokens_seen": 512979456, "step": 503800 }, { "epoch": 9.921049004744935, "grad_norm": 2.1425445079803467, "learning_rate": 2.572000166735602e-07, "loss": 1.9495, "num_input_tokens_seen": 513081248, "step": 503900 }, { "epoch": 9.923017857494438, "grad_norm": 1.9135055541992188, "learning_rate": 2.5701979823030796e-07, "loss": 1.9724, "num_input_tokens_seen": 513183648, "step": 504000 }, { "epoch": 9.92498671024394, "grad_norm": 1.9988261461257935, "learning_rate": 2.5683962110378935e-07, "loss": 1.9874, "num_input_tokens_seen": 513286048, "step": 504100 }, { "epoch": 9.926955562993443, "grad_norm": 1.8992489576339722, "learning_rate": 2.5665948532464254e-07, "loss": 1.9934, "num_input_tokens_seen": 513387744, "step": 504200 }, { "epoch": 9.928924415742946, "grad_norm": 1.9562246799468994, "learning_rate": 2.5647939092349746e-07, "loss": 1.9569, "num_input_tokens_seen": 513490144, "step": 504300 }, { "epoch": 9.930893268492449, "grad_norm": 1.9197726249694824, "learning_rate": 2.562993379309784e-07, "loss": 1.9535, "num_input_tokens_seen": 513592184, "step": 504400 }, { "epoch": 9.932862121241952, "grad_norm": 2.091519594192505, "learning_rate": 2.561193263777012e-07, "loss": 1.9927, "num_input_tokens_seen": 513693624, "step": 504500 }, { "epoch": 9.934830973991454, "grad_norm": 1.6819130182266235, "learning_rate": 2.5593935629427555e-07, "loss": 2.032, "num_input_tokens_seen": 513794608, "step": 504600 }, { "epoch": 9.936799826740959, "grad_norm": 2.0900344848632812, "learning_rate": 2.557594277113039e-07, "loss": 1.9867, "num_input_tokens_seen": 513896696, "step": 504700 }, { "epoch": 9.938768679490462, "grad_norm": 1.9267830848693848, "learning_rate": 2.555795406593816e-07, "loss": 1.9819, "num_input_tokens_seen": 513998472, "step": 504800 }, { "epoch": 9.940737532239964, "grad_norm": 1.9433521032333374, "learning_rate": 2.55399695169097e-07, "loss": 2.0163, "num_input_tokens_seen": 514100400, "step": 504900 }, { "epoch": 9.942706384989467, "grad_norm": 1.8832790851593018, "learning_rate": 2.55219891271031e-07, "loss": 1.9759, "num_input_tokens_seen": 514202208, "step": 505000 }, { "epoch": 9.94467523773897, "grad_norm": 1.907757043838501, "learning_rate": 2.550401289957579e-07, "loss": 1.9478, "num_input_tokens_seen": 514304608, "step": 505100 }, { "epoch": 9.946644090488473, "grad_norm": 1.7344521284103394, "learning_rate": 2.548604083738448e-07, "loss": 1.9466, "num_input_tokens_seen": 514404648, "step": 505200 }, { "epoch": 9.948612943237976, "grad_norm": 1.7976120710372925, "learning_rate": 2.546807294358518e-07, "loss": 1.9911, "num_input_tokens_seen": 514506368, "step": 505300 }, { "epoch": 9.950581795987478, "grad_norm": 2.101142406463623, "learning_rate": 2.5450109221233157e-07, "loss": 1.9833, "num_input_tokens_seen": 514608768, "step": 505400 }, { "epoch": 9.952550648736981, "grad_norm": 2.016080856323242, "learning_rate": 2.543214967338299e-07, "loss": 2.0226, "num_input_tokens_seen": 514710648, "step": 505500 }, { "epoch": 9.954519501486484, "grad_norm": 3.0059051513671875, "learning_rate": 2.5414194303088555e-07, "loss": 2.0253, "num_input_tokens_seen": 514811992, "step": 505600 }, { "epoch": 9.956488354235987, "grad_norm": 2.1360247135162354, "learning_rate": 2.5396243113403016e-07, "loss": 2.0207, "num_input_tokens_seen": 514912672, "step": 505700 }, { "epoch": 9.95845720698549, "grad_norm": 1.8347407579421997, "learning_rate": 2.537829610737883e-07, "loss": 1.9898, "num_input_tokens_seen": 515013056, "step": 505800 }, { "epoch": 9.960426059734992, "grad_norm": 1.8011956214904785, "learning_rate": 2.5360353288067714e-07, "loss": 1.9628, "num_input_tokens_seen": 515114144, "step": 505900 }, { "epoch": 9.962394912484495, "grad_norm": 1.6846932172775269, "learning_rate": 2.534241465852069e-07, "loss": 2.02, "num_input_tokens_seen": 515216456, "step": 506000 }, { "epoch": 9.964363765233998, "grad_norm": 1.8579161167144775, "learning_rate": 2.532448022178809e-07, "loss": 2.0424, "num_input_tokens_seen": 515318376, "step": 506100 }, { "epoch": 9.9663326179835, "grad_norm": 1.856040358543396, "learning_rate": 2.5306549980919525e-07, "loss": 1.9999, "num_input_tokens_seen": 515420776, "step": 506200 }, { "epoch": 9.968301470733003, "grad_norm": 1.9598783254623413, "learning_rate": 2.5288623938963833e-07, "loss": 1.9778, "num_input_tokens_seen": 515523040, "step": 506300 }, { "epoch": 9.970270323482506, "grad_norm": 2.1430556774139404, "learning_rate": 2.5270702098969223e-07, "loss": 1.9611, "num_input_tokens_seen": 515624704, "step": 506400 }, { "epoch": 9.97223917623201, "grad_norm": 1.95353364944458, "learning_rate": 2.525278446398314e-07, "loss": 2.0287, "num_input_tokens_seen": 515727104, "step": 506500 }, { "epoch": 9.974208028981513, "grad_norm": 1.8301260471343994, "learning_rate": 2.523487103705233e-07, "loss": 1.9825, "num_input_tokens_seen": 515827264, "step": 506600 }, { "epoch": 9.976176881731016, "grad_norm": 1.9590789079666138, "learning_rate": 2.521696182122285e-07, "loss": 1.9909, "num_input_tokens_seen": 515929664, "step": 506700 }, { "epoch": 9.978145734480519, "grad_norm": 2.2142257690429688, "learning_rate": 2.5199056819539935e-07, "loss": 1.9831, "num_input_tokens_seen": 516032064, "step": 506800 }, { "epoch": 9.980114587230021, "grad_norm": 1.8652536869049072, "learning_rate": 2.5181156035048267e-07, "loss": 1.9951, "num_input_tokens_seen": 516133664, "step": 506900 }, { "epoch": 9.982083439979524, "grad_norm": 1.9279258251190186, "learning_rate": 2.5163259470791665e-07, "loss": 2.0282, "num_input_tokens_seen": 516233848, "step": 507000 }, { "epoch": 9.984052292729027, "grad_norm": 1.8755311965942383, "learning_rate": 2.514536712981332e-07, "loss": 1.9461, "num_input_tokens_seen": 516336248, "step": 507100 }, { "epoch": 9.98602114547853, "grad_norm": 1.9926308393478394, "learning_rate": 2.5127479015155633e-07, "loss": 1.9632, "num_input_tokens_seen": 516438648, "step": 507200 }, { "epoch": 9.987989998228032, "grad_norm": 1.844283938407898, "learning_rate": 2.5109595129860394e-07, "loss": 2.0052, "num_input_tokens_seen": 516540552, "step": 507300 }, { "epoch": 9.989958850977535, "grad_norm": 2.0283586978912354, "learning_rate": 2.5091715476968547e-07, "loss": 1.9831, "num_input_tokens_seen": 516641440, "step": 507400 }, { "epoch": 9.991927703727038, "grad_norm": 2.089303970336914, "learning_rate": 2.5073840059520407e-07, "loss": 1.9368, "num_input_tokens_seen": 516743064, "step": 507500 }, { "epoch": 9.99389655647654, "grad_norm": 1.672130823135376, "learning_rate": 2.50559688805555e-07, "loss": 1.9597, "num_input_tokens_seen": 516845464, "step": 507600 }, { "epoch": 9.995865409226043, "grad_norm": 3.2307937145233154, "learning_rate": 2.503810194311271e-07, "loss": 1.9873, "num_input_tokens_seen": 516946080, "step": 507700 }, { "epoch": 9.997834261975546, "grad_norm": 1.9267553091049194, "learning_rate": 2.502023925023017e-07, "loss": 1.9768, "num_input_tokens_seen": 517048480, "step": 507800 }, { "epoch": 9.999803114725049, "grad_norm": 1.6313378810882568, "learning_rate": 2.5002380804945233e-07, "loss": 1.9664, "num_input_tokens_seen": 517150880, "step": 507900 }, { "epoch": 10.001771967474552, "grad_norm": 1.7744789123535156, "learning_rate": 2.4984526610294596e-07, "loss": 2.0134, "num_input_tokens_seen": 517252488, "step": 508000 }, { "epoch": 10.003740820224056, "grad_norm": 1.9814518690109253, "learning_rate": 2.496667666931423e-07, "loss": 1.9949, "num_input_tokens_seen": 517352984, "step": 508100 }, { "epoch": 10.005709672973559, "grad_norm": 1.8977552652359009, "learning_rate": 2.494883098503937e-07, "loss": 1.9413, "num_input_tokens_seen": 517454640, "step": 508200 }, { "epoch": 10.007678525723062, "grad_norm": 2.129425287246704, "learning_rate": 2.493098956050449e-07, "loss": 1.9683, "num_input_tokens_seen": 517555064, "step": 508300 }, { "epoch": 10.009647378472565, "grad_norm": 1.9466496706008911, "learning_rate": 2.491315239874339e-07, "loss": 2.0346, "num_input_tokens_seen": 517656984, "step": 508400 }, { "epoch": 10.011616231222067, "grad_norm": 1.9415394067764282, "learning_rate": 2.4895319502789143e-07, "loss": 2.0483, "num_input_tokens_seen": 517759384, "step": 508500 }, { "epoch": 10.01358508397157, "grad_norm": 2.147742986679077, "learning_rate": 2.487749087567406e-07, "loss": 1.9784, "num_input_tokens_seen": 517861280, "step": 508600 }, { "epoch": 10.015553936721073, "grad_norm": 1.732603669166565, "learning_rate": 2.4859666520429785e-07, "loss": 1.9719, "num_input_tokens_seen": 517963208, "step": 508700 }, { "epoch": 10.017522789470576, "grad_norm": 2.3705923557281494, "learning_rate": 2.4841846440087156e-07, "loss": 2.0126, "num_input_tokens_seen": 518064736, "step": 508800 }, { "epoch": 10.019491642220078, "grad_norm": 1.7529703378677368, "learning_rate": 2.482403063767635e-07, "loss": 1.9861, "num_input_tokens_seen": 518167136, "step": 508900 }, { "epoch": 10.021460494969581, "grad_norm": 2.0923120975494385, "learning_rate": 2.4806219116226787e-07, "loss": 1.9833, "num_input_tokens_seen": 518268768, "step": 509000 }, { "epoch": 10.023429347719084, "grad_norm": 1.7131842374801636, "learning_rate": 2.4788411878767185e-07, "loss": 1.9851, "num_input_tokens_seen": 518370488, "step": 509100 }, { "epoch": 10.025398200468587, "grad_norm": 1.8414028882980347, "learning_rate": 2.477060892832548e-07, "loss": 2.0302, "num_input_tokens_seen": 518472800, "step": 509200 }, { "epoch": 10.02736705321809, "grad_norm": 1.709205150604248, "learning_rate": 2.475281026792893e-07, "loss": 2.0248, "num_input_tokens_seen": 518574760, "step": 509300 }, { "epoch": 10.029335905967592, "grad_norm": 2.290510654449463, "learning_rate": 2.4735015900604056e-07, "loss": 1.9267, "num_input_tokens_seen": 518676296, "step": 509400 }, { "epoch": 10.031304758717095, "grad_norm": 2.1205391883850098, "learning_rate": 2.471722582937665e-07, "loss": 1.9654, "num_input_tokens_seen": 518778696, "step": 509500 }, { "epoch": 10.033273611466598, "grad_norm": 2.234224557876587, "learning_rate": 2.469944005727171e-07, "loss": 1.9638, "num_input_tokens_seen": 518881096, "step": 509600 }, { "epoch": 10.0352424642161, "grad_norm": 2.0411813259124756, "learning_rate": 2.468165858731361e-07, "loss": 2.0003, "num_input_tokens_seen": 518982808, "step": 509700 }, { "epoch": 10.037211316965605, "grad_norm": 2.2885854244232178, "learning_rate": 2.466388142252594e-07, "loss": 1.9397, "num_input_tokens_seen": 519085208, "step": 509800 }, { "epoch": 10.039180169715108, "grad_norm": 1.8394403457641602, "learning_rate": 2.464610856593152e-07, "loss": 1.9773, "num_input_tokens_seen": 519187608, "step": 509900 }, { "epoch": 10.04114902246461, "grad_norm": 2.230210065841675, "learning_rate": 2.4628340020552503e-07, "loss": 2.0049, "num_input_tokens_seen": 519288824, "step": 510000 }, { "epoch": 10.043117875214113, "grad_norm": 2.0309078693389893, "learning_rate": 2.4610575789410266e-07, "loss": 1.9865, "num_input_tokens_seen": 519391224, "step": 510100 }, { "epoch": 10.045086727963616, "grad_norm": 2.0254244804382324, "learning_rate": 2.4592815875525483e-07, "loss": 2.0033, "num_input_tokens_seen": 519493624, "step": 510200 }, { "epoch": 10.047055580713119, "grad_norm": 2.09907865524292, "learning_rate": 2.457506028191806e-07, "loss": 1.9956, "num_input_tokens_seen": 519594240, "step": 510300 }, { "epoch": 10.049024433462622, "grad_norm": 1.7832585573196411, "learning_rate": 2.455730901160718e-07, "loss": 1.9841, "num_input_tokens_seen": 519695984, "step": 510400 }, { "epoch": 10.050993286212124, "grad_norm": 2.0728001594543457, "learning_rate": 2.453956206761132e-07, "loss": 1.9647, "num_input_tokens_seen": 519798384, "step": 510500 }, { "epoch": 10.052962138961627, "grad_norm": 2.0551185607910156, "learning_rate": 2.45218194529482e-07, "loss": 2.0005, "num_input_tokens_seen": 519900048, "step": 510600 }, { "epoch": 10.05493099171113, "grad_norm": 13.11880874633789, "learning_rate": 2.450408117063477e-07, "loss": 1.9848, "num_input_tokens_seen": 520001808, "step": 510700 }, { "epoch": 10.056899844460633, "grad_norm": 1.873674988746643, "learning_rate": 2.44863472236873e-07, "loss": 1.9745, "num_input_tokens_seen": 520102888, "step": 510800 }, { "epoch": 10.058868697210135, "grad_norm": 1.6636666059494019, "learning_rate": 2.4468617615121285e-07, "loss": 1.9473, "num_input_tokens_seen": 520205288, "step": 510900 }, { "epoch": 10.060837549959638, "grad_norm": 1.7565938234329224, "learning_rate": 2.44508923479515e-07, "loss": 1.9238, "num_input_tokens_seen": 520307688, "step": 511000 }, { "epoch": 10.06280640270914, "grad_norm": 1.8458462953567505, "learning_rate": 2.4433171425192013e-07, "loss": 2.0178, "num_input_tokens_seen": 520408568, "step": 511100 }, { "epoch": 10.064775255458644, "grad_norm": 3.9458611011505127, "learning_rate": 2.4415454849856053e-07, "loss": 2.0106, "num_input_tokens_seen": 520510488, "step": 511200 }, { "epoch": 10.066744108208146, "grad_norm": 1.7859681844711304, "learning_rate": 2.4397742624956214e-07, "loss": 2.0193, "num_input_tokens_seen": 520612128, "step": 511300 }, { "epoch": 10.06871296095765, "grad_norm": 1.888371467590332, "learning_rate": 2.4380034753504296e-07, "loss": 1.9404, "num_input_tokens_seen": 520712912, "step": 511400 }, { "epoch": 10.070681813707154, "grad_norm": 1.90412175655365, "learning_rate": 2.43623312385114e-07, "loss": 1.9989, "num_input_tokens_seen": 520813928, "step": 511500 }, { "epoch": 10.072650666456656, "grad_norm": 2.122999668121338, "learning_rate": 2.434463208298782e-07, "loss": 1.9649, "num_input_tokens_seen": 520914072, "step": 511600 }, { "epoch": 10.07461951920616, "grad_norm": 1.72954261302948, "learning_rate": 2.432693728994315e-07, "loss": 1.9807, "num_input_tokens_seen": 521015904, "step": 511700 }, { "epoch": 10.076588371955662, "grad_norm": 1.7599509954452515, "learning_rate": 2.4309246862386295e-07, "loss": 1.9625, "num_input_tokens_seen": 521118304, "step": 511800 }, { "epoch": 10.078557224705165, "grad_norm": 2.1503989696502686, "learning_rate": 2.4291560803325304e-07, "loss": 1.9813, "num_input_tokens_seen": 521218856, "step": 511900 }, { "epoch": 10.080526077454667, "grad_norm": 2.321514129638672, "learning_rate": 2.427387911576759e-07, "loss": 2.0091, "num_input_tokens_seen": 521321256, "step": 512000 }, { "epoch": 10.08249493020417, "grad_norm": 1.913076639175415, "learning_rate": 2.42562018027197e-07, "loss": 2.0034, "num_input_tokens_seen": 521422960, "step": 512100 }, { "epoch": 10.084463782953673, "grad_norm": 2.048002243041992, "learning_rate": 2.42385288671876e-07, "loss": 2.0196, "num_input_tokens_seen": 521524184, "step": 512200 }, { "epoch": 10.086432635703176, "grad_norm": 2.110062837600708, "learning_rate": 2.4220860312176364e-07, "loss": 1.965, "num_input_tokens_seen": 521625968, "step": 512300 }, { "epoch": 10.088401488452678, "grad_norm": 1.9550901651382446, "learning_rate": 2.420319614069043e-07, "loss": 2.0187, "num_input_tokens_seen": 521727608, "step": 512400 }, { "epoch": 10.090370341202181, "grad_norm": 1.8020589351654053, "learning_rate": 2.4185536355733363e-07, "loss": 1.9888, "num_input_tokens_seen": 521829152, "step": 512500 }, { "epoch": 10.092339193951684, "grad_norm": 1.8266799449920654, "learning_rate": 2.4167880960308153e-07, "loss": 1.9639, "num_input_tokens_seen": 521931552, "step": 512600 }, { "epoch": 10.094308046701187, "grad_norm": 1.8024038076400757, "learning_rate": 2.4150229957416887e-07, "loss": 1.9477, "num_input_tokens_seen": 522033448, "step": 512700 }, { "epoch": 10.09627689945069, "grad_norm": 1.9305915832519531, "learning_rate": 2.4132583350060984e-07, "loss": 1.9932, "num_input_tokens_seen": 522134376, "step": 512800 }, { "epoch": 10.098245752200192, "grad_norm": 1.8215798139572144, "learning_rate": 2.411494114124112e-07, "loss": 1.9581, "num_input_tokens_seen": 522236776, "step": 512900 }, { "epoch": 10.100214604949695, "grad_norm": 1.7347626686096191, "learning_rate": 2.409730333395718e-07, "loss": 1.9736, "num_input_tokens_seen": 522338552, "step": 513000 }, { "epoch": 10.102183457699198, "grad_norm": 1.7242077589035034, "learning_rate": 2.407966993120835e-07, "loss": 1.9951, "num_input_tokens_seen": 522440952, "step": 513100 }, { "epoch": 10.104152310448702, "grad_norm": 1.8284276723861694, "learning_rate": 2.406204093599302e-07, "loss": 1.9675, "num_input_tokens_seen": 522542616, "step": 513200 }, { "epoch": 10.106121163198205, "grad_norm": 2.1157314777374268, "learning_rate": 2.404441635130885e-07, "loss": 1.9681, "num_input_tokens_seen": 522645016, "step": 513300 }, { "epoch": 10.108090015947708, "grad_norm": 1.990410327911377, "learning_rate": 2.4026796180152757e-07, "loss": 1.9743, "num_input_tokens_seen": 522746672, "step": 513400 }, { "epoch": 10.11005886869721, "grad_norm": 2.300504446029663, "learning_rate": 2.400918042552093e-07, "loss": 1.9734, "num_input_tokens_seen": 522849072, "step": 513500 }, { "epoch": 10.112027721446713, "grad_norm": 2.0461835861206055, "learning_rate": 2.3991569090408737e-07, "loss": 2.0136, "num_input_tokens_seen": 522949760, "step": 513600 }, { "epoch": 10.113996574196216, "grad_norm": 1.8705706596374512, "learning_rate": 2.3973962177810845e-07, "loss": 1.9863, "num_input_tokens_seen": 523051608, "step": 513700 }, { "epoch": 10.115965426945719, "grad_norm": 1.970261812210083, "learning_rate": 2.3956359690721186e-07, "loss": 1.9778, "num_input_tokens_seen": 523153568, "step": 513800 }, { "epoch": 10.117934279695222, "grad_norm": 2.207791805267334, "learning_rate": 2.393876163213289e-07, "loss": 1.9951, "num_input_tokens_seen": 523255056, "step": 513900 }, { "epoch": 10.119903132444724, "grad_norm": 2.0701262950897217, "learning_rate": 2.3921168005038384e-07, "loss": 1.9755, "num_input_tokens_seen": 523357456, "step": 514000 }, { "epoch": 10.121871985194227, "grad_norm": 1.9004077911376953, "learning_rate": 2.390357881242929e-07, "loss": 1.9992, "num_input_tokens_seen": 523459856, "step": 514100 }, { "epoch": 10.12384083794373, "grad_norm": 2.1974658966064453, "learning_rate": 2.3885994057296506e-07, "loss": 1.9695, "num_input_tokens_seen": 523561464, "step": 514200 }, { "epoch": 10.125809690693233, "grad_norm": 2.118424892425537, "learning_rate": 2.386841374263017e-07, "loss": 1.9772, "num_input_tokens_seen": 523663864, "step": 514300 }, { "epoch": 10.127778543442735, "grad_norm": 1.9299614429473877, "learning_rate": 2.3850837871419703e-07, "loss": 1.9902, "num_input_tokens_seen": 523765712, "step": 514400 }, { "epoch": 10.129747396192238, "grad_norm": 1.9739350080490112, "learning_rate": 2.3833266446653682e-07, "loss": 2.0134, "num_input_tokens_seen": 523867632, "step": 514500 }, { "epoch": 10.131716248941741, "grad_norm": 1.9896650314331055, "learning_rate": 2.381569947132e-07, "loss": 1.9597, "num_input_tokens_seen": 523969264, "step": 514600 }, { "epoch": 10.133685101691244, "grad_norm": 1.8265329599380493, "learning_rate": 2.379813694840578e-07, "loss": 1.9922, "num_input_tokens_seen": 524069256, "step": 514700 }, { "epoch": 10.135653954440748, "grad_norm": 2.0936081409454346, "learning_rate": 2.3780578880897367e-07, "loss": 1.9293, "num_input_tokens_seen": 524170704, "step": 514800 }, { "epoch": 10.137622807190251, "grad_norm": 1.9497532844543457, "learning_rate": 2.3763025271780392e-07, "loss": 1.9511, "num_input_tokens_seen": 524272904, "step": 514900 }, { "epoch": 10.139591659939754, "grad_norm": 2.0265049934387207, "learning_rate": 2.3745476124039638e-07, "loss": 1.959, "num_input_tokens_seen": 524375304, "step": 515000 }, { "epoch": 10.141560512689257, "grad_norm": 1.735066294670105, "learning_rate": 2.3727931440659267e-07, "loss": 1.9715, "num_input_tokens_seen": 524477112, "step": 515100 }, { "epoch": 10.14352936543876, "grad_norm": 2.035606622695923, "learning_rate": 2.371039122462255e-07, "loss": 1.9913, "num_input_tokens_seen": 524579296, "step": 515200 }, { "epoch": 10.145498218188262, "grad_norm": 2.1595828533172607, "learning_rate": 2.3692855478912082e-07, "loss": 1.9816, "num_input_tokens_seen": 524681208, "step": 515300 }, { "epoch": 10.147467070937765, "grad_norm": 2.4264023303985596, "learning_rate": 2.3675324206509624e-07, "loss": 1.9869, "num_input_tokens_seen": 524781424, "step": 515400 }, { "epoch": 10.149435923687268, "grad_norm": 1.8765474557876587, "learning_rate": 2.3657797410396279e-07, "loss": 2.0115, "num_input_tokens_seen": 524883008, "step": 515500 }, { "epoch": 10.15140477643677, "grad_norm": 2.0481069087982178, "learning_rate": 2.3640275093552292e-07, "loss": 1.9893, "num_input_tokens_seen": 524985040, "step": 515600 }, { "epoch": 10.153373629186273, "grad_norm": 1.9797998666763306, "learning_rate": 2.3622757258957215e-07, "loss": 1.9814, "num_input_tokens_seen": 525086168, "step": 515700 }, { "epoch": 10.155342481935776, "grad_norm": 2.477825164794922, "learning_rate": 2.3605243909589756e-07, "loss": 2.0084, "num_input_tokens_seen": 525188568, "step": 515800 }, { "epoch": 10.157311334685279, "grad_norm": 1.7219065427780151, "learning_rate": 2.3587735048427964e-07, "loss": 2.0243, "num_input_tokens_seen": 525289304, "step": 515900 }, { "epoch": 10.159280187434781, "grad_norm": 1.96244215965271, "learning_rate": 2.357023067844907e-07, "loss": 2.0297, "num_input_tokens_seen": 525389840, "step": 516000 }, { "epoch": 10.161249040184284, "grad_norm": 1.9272196292877197, "learning_rate": 2.3552730802629518e-07, "loss": 1.985, "num_input_tokens_seen": 525492240, "step": 516100 }, { "epoch": 10.163217892933787, "grad_norm": 1.8310563564300537, "learning_rate": 2.3535235423945022e-07, "loss": 2.0207, "num_input_tokens_seen": 525594056, "step": 516200 }, { "epoch": 10.16518674568329, "grad_norm": 2.5422022342681885, "learning_rate": 2.3517744545370527e-07, "loss": 1.9616, "num_input_tokens_seen": 525696456, "step": 516300 }, { "epoch": 10.167155598432792, "grad_norm": 1.7177711725234985, "learning_rate": 2.3500258169880233e-07, "loss": 1.9646, "num_input_tokens_seen": 525798416, "step": 516400 }, { "epoch": 10.169124451182297, "grad_norm": 1.9400345087051392, "learning_rate": 2.348277630044751e-07, "loss": 1.9957, "num_input_tokens_seen": 525900816, "step": 516500 }, { "epoch": 10.1710933039318, "grad_norm": 2.4387450218200684, "learning_rate": 2.3465298940045014e-07, "loss": 1.9885, "num_input_tokens_seen": 526002544, "step": 516600 }, { "epoch": 10.173062156681302, "grad_norm": 5.567256450653076, "learning_rate": 2.3447826091644635e-07, "loss": 2.0208, "num_input_tokens_seen": 526103712, "step": 516700 }, { "epoch": 10.175031009430805, "grad_norm": 2.090766668319702, "learning_rate": 2.3430357758217478e-07, "loss": 2.0158, "num_input_tokens_seen": 526206112, "step": 516800 }, { "epoch": 10.176999862180308, "grad_norm": 1.9667357206344604, "learning_rate": 2.34128939427339e-07, "loss": 1.9853, "num_input_tokens_seen": 526307128, "step": 516900 }, { "epoch": 10.17896871492981, "grad_norm": 2.358248233795166, "learning_rate": 2.339543464816344e-07, "loss": 1.994, "num_input_tokens_seen": 526407776, "step": 517000 }, { "epoch": 10.180937567679313, "grad_norm": 2.1375815868377686, "learning_rate": 2.3377979877474935e-07, "loss": 1.9619, "num_input_tokens_seen": 526510176, "step": 517100 }, { "epoch": 10.182906420428816, "grad_norm": 4.232319355010986, "learning_rate": 2.3360529633636406e-07, "loss": 1.9753, "num_input_tokens_seen": 526612576, "step": 517200 }, { "epoch": 10.184875273178319, "grad_norm": 1.7390012741088867, "learning_rate": 2.334308391961514e-07, "loss": 2.0142, "num_input_tokens_seen": 526714120, "step": 517300 }, { "epoch": 10.186844125927822, "grad_norm": 2.1147119998931885, "learning_rate": 2.3325642738377604e-07, "loss": 1.9928, "num_input_tokens_seen": 526815200, "step": 517400 }, { "epoch": 10.188812978677324, "grad_norm": 1.6188441514968872, "learning_rate": 2.3308206092889537e-07, "loss": 1.9741, "num_input_tokens_seen": 526917600, "step": 517500 }, { "epoch": 10.190781831426827, "grad_norm": 2.1635310649871826, "learning_rate": 2.3290773986115896e-07, "loss": 1.9922, "num_input_tokens_seen": 527019424, "step": 517600 }, { "epoch": 10.19275068417633, "grad_norm": 2.92665433883667, "learning_rate": 2.3273346421020884e-07, "loss": 1.9675, "num_input_tokens_seen": 527119576, "step": 517700 }, { "epoch": 10.194719536925833, "grad_norm": 2.241255521774292, "learning_rate": 2.325592340056785e-07, "loss": 1.9674, "num_input_tokens_seen": 527220704, "step": 517800 }, { "epoch": 10.196688389675336, "grad_norm": 1.9111897945404053, "learning_rate": 2.3238504927719489e-07, "loss": 1.964, "num_input_tokens_seen": 527323104, "step": 517900 }, { "epoch": 10.198657242424838, "grad_norm": 1.8900012969970703, "learning_rate": 2.3221091005437671e-07, "loss": 1.9945, "num_input_tokens_seen": 527425504, "step": 518000 }, { "epoch": 10.200626095174341, "grad_norm": 2.0185890197753906, "learning_rate": 2.3203681636683443e-07, "loss": 1.9797, "num_input_tokens_seen": 527527904, "step": 518100 }, { "epoch": 10.202594947923846, "grad_norm": 1.8185033798217773, "learning_rate": 2.318627682441714e-07, "loss": 2.0198, "num_input_tokens_seen": 527629408, "step": 518200 }, { "epoch": 10.204563800673348, "grad_norm": 2.0610711574554443, "learning_rate": 2.3168876571598305e-07, "loss": 1.9578, "num_input_tokens_seen": 527731808, "step": 518300 }, { "epoch": 10.206532653422851, "grad_norm": 1.778607964515686, "learning_rate": 2.3151480881185724e-07, "loss": 1.9384, "num_input_tokens_seen": 527834208, "step": 518400 }, { "epoch": 10.208501506172354, "grad_norm": 2.1930012702941895, "learning_rate": 2.3134089756137355e-07, "loss": 1.9443, "num_input_tokens_seen": 527936608, "step": 518500 }, { "epoch": 10.210470358921857, "grad_norm": 2.0562498569488525, "learning_rate": 2.311670319941042e-07, "loss": 1.9886, "num_input_tokens_seen": 528038208, "step": 518600 }, { "epoch": 10.21243921167136, "grad_norm": 1.7914087772369385, "learning_rate": 2.3099321213961365e-07, "loss": 1.9796, "num_input_tokens_seen": 528138552, "step": 518700 }, { "epoch": 10.214408064420862, "grad_norm": 1.8735454082489014, "learning_rate": 2.3081943802745845e-07, "loss": 1.9795, "num_input_tokens_seen": 528240952, "step": 518800 }, { "epoch": 10.216376917170365, "grad_norm": 1.8071253299713135, "learning_rate": 2.306457096871876e-07, "loss": 1.9542, "num_input_tokens_seen": 528341680, "step": 518900 }, { "epoch": 10.218345769919868, "grad_norm": 1.6748353242874146, "learning_rate": 2.3047202714834185e-07, "loss": 1.9563, "num_input_tokens_seen": 528444080, "step": 519000 }, { "epoch": 10.22031462266937, "grad_norm": 2.3489108085632324, "learning_rate": 2.302983904404545e-07, "loss": 1.9612, "num_input_tokens_seen": 528545760, "step": 519100 }, { "epoch": 10.222283475418873, "grad_norm": 1.99221932888031, "learning_rate": 2.3012479959305114e-07, "loss": 1.9761, "num_input_tokens_seen": 528648160, "step": 519200 }, { "epoch": 10.224252328168376, "grad_norm": 2.010105609893799, "learning_rate": 2.2995125463564956e-07, "loss": 1.9773, "num_input_tokens_seen": 528750560, "step": 519300 }, { "epoch": 10.226221180917879, "grad_norm": 1.7972813844680786, "learning_rate": 2.2977775559775924e-07, "loss": 1.9713, "num_input_tokens_seen": 528852960, "step": 519400 }, { "epoch": 10.228190033667381, "grad_norm": 1.8483458757400513, "learning_rate": 2.2960430250888247e-07, "loss": 1.9711, "num_input_tokens_seen": 528955360, "step": 519500 }, { "epoch": 10.230158886416884, "grad_norm": 1.818363904953003, "learning_rate": 2.294308953985134e-07, "loss": 2.0199, "num_input_tokens_seen": 529055856, "step": 519600 }, { "epoch": 10.232127739166387, "grad_norm": 2.1480441093444824, "learning_rate": 2.2925753429613863e-07, "loss": 1.9829, "num_input_tokens_seen": 529158256, "step": 519700 }, { "epoch": 10.23409659191589, "grad_norm": 1.8674441576004028, "learning_rate": 2.2908421923123644e-07, "loss": 2.0033, "num_input_tokens_seen": 529260656, "step": 519800 }, { "epoch": 10.236065444665394, "grad_norm": 2.164078950881958, "learning_rate": 2.2891095023327762e-07, "loss": 1.9559, "num_input_tokens_seen": 529363056, "step": 519900 }, { "epoch": 10.238034297414897, "grad_norm": 1.9214270114898682, "learning_rate": 2.2873772733172559e-07, "loss": 1.9941, "num_input_tokens_seen": 529465456, "step": 520000 }, { "epoch": 10.2400031501644, "grad_norm": 1.9703806638717651, "learning_rate": 2.2856455055603485e-07, "loss": 2.0323, "num_input_tokens_seen": 529567856, "step": 520100 }, { "epoch": 10.241972002913903, "grad_norm": 2.055276393890381, "learning_rate": 2.2839141993565314e-07, "loss": 1.9791, "num_input_tokens_seen": 529670256, "step": 520200 }, { "epoch": 10.243940855663405, "grad_norm": 2.2363429069519043, "learning_rate": 2.2821833550001923e-07, "loss": 1.9815, "num_input_tokens_seen": 529772656, "step": 520300 }, { "epoch": 10.245909708412908, "grad_norm": 1.9203091859817505, "learning_rate": 2.2804529727856547e-07, "loss": 1.9928, "num_input_tokens_seen": 529875056, "step": 520400 }, { "epoch": 10.24787856116241, "grad_norm": 2.432593822479248, "learning_rate": 2.278723053007149e-07, "loss": 2.0213, "num_input_tokens_seen": 529975784, "step": 520500 }, { "epoch": 10.249847413911914, "grad_norm": 2.023099899291992, "learning_rate": 2.2769935959588377e-07, "loss": 1.9674, "num_input_tokens_seen": 530077432, "step": 520600 }, { "epoch": 10.251816266661416, "grad_norm": 2.149197816848755, "learning_rate": 2.2752646019347947e-07, "loss": 1.9796, "num_input_tokens_seen": 530179832, "step": 520700 }, { "epoch": 10.253785119410919, "grad_norm": 1.8262792825698853, "learning_rate": 2.273536071229029e-07, "loss": 2.0127, "num_input_tokens_seen": 530280344, "step": 520800 }, { "epoch": 10.255753972160422, "grad_norm": 2.2032392024993896, "learning_rate": 2.271808004135456e-07, "loss": 1.9719, "num_input_tokens_seen": 530382744, "step": 520900 }, { "epoch": 10.257722824909925, "grad_norm": 1.779129147529602, "learning_rate": 2.2700804009479212e-07, "loss": 1.9448, "num_input_tokens_seen": 530485144, "step": 521000 }, { "epoch": 10.259691677659427, "grad_norm": 1.8416393995285034, "learning_rate": 2.268353261960189e-07, "loss": 1.9488, "num_input_tokens_seen": 530587544, "step": 521100 }, { "epoch": 10.26166053040893, "grad_norm": 1.9500023126602173, "learning_rate": 2.266626587465945e-07, "loss": 2.0316, "num_input_tokens_seen": 530689944, "step": 521200 }, { "epoch": 10.263629383158433, "grad_norm": 1.6354200839996338, "learning_rate": 2.2649003777587978e-07, "loss": 1.9591, "num_input_tokens_seen": 530792344, "step": 521300 }, { "epoch": 10.265598235907936, "grad_norm": 1.780234694480896, "learning_rate": 2.2631746331322697e-07, "loss": 2.0049, "num_input_tokens_seen": 530893896, "step": 521400 }, { "epoch": 10.26756708865744, "grad_norm": 2.0246331691741943, "learning_rate": 2.2614493538798124e-07, "loss": 1.9504, "num_input_tokens_seen": 530996296, "step": 521500 }, { "epoch": 10.269535941406943, "grad_norm": 2.1935033798217773, "learning_rate": 2.2597245402947946e-07, "loss": 1.9635, "num_input_tokens_seen": 531098696, "step": 521600 }, { "epoch": 10.271504794156446, "grad_norm": 1.843631625175476, "learning_rate": 2.2580001926705073e-07, "loss": 1.949, "num_input_tokens_seen": 531201096, "step": 521700 }, { "epoch": 10.273473646905948, "grad_norm": 2.0420875549316406, "learning_rate": 2.2562763113001588e-07, "loss": 2.0177, "num_input_tokens_seen": 531302584, "step": 521800 }, { "epoch": 10.275442499655451, "grad_norm": 1.9334372282028198, "learning_rate": 2.2545528964768818e-07, "loss": 2.0019, "num_input_tokens_seen": 531404984, "step": 521900 }, { "epoch": 10.277411352404954, "grad_norm": 1.931643009185791, "learning_rate": 2.2528299484937285e-07, "loss": 2.0172, "num_input_tokens_seen": 531507384, "step": 522000 }, { "epoch": 10.279380205154457, "grad_norm": 2.018916368484497, "learning_rate": 2.251107467643672e-07, "loss": 2.0293, "num_input_tokens_seen": 531608608, "step": 522100 }, { "epoch": 10.28134905790396, "grad_norm": 1.982254981994629, "learning_rate": 2.2493854542196073e-07, "loss": 1.9613, "num_input_tokens_seen": 531710160, "step": 522200 }, { "epoch": 10.283317910653462, "grad_norm": 1.9437209367752075, "learning_rate": 2.247663908514344e-07, "loss": 1.949, "num_input_tokens_seen": 531811880, "step": 522300 }, { "epoch": 10.285286763402965, "grad_norm": 2.0421812534332275, "learning_rate": 2.245942830820619e-07, "loss": 1.9901, "num_input_tokens_seen": 531913512, "step": 522400 }, { "epoch": 10.287255616152468, "grad_norm": 1.7826402187347412, "learning_rate": 2.2442222214310869e-07, "loss": 2.029, "num_input_tokens_seen": 532014576, "step": 522500 }, { "epoch": 10.28922446890197, "grad_norm": 1.862229824066162, "learning_rate": 2.2425020806383248e-07, "loss": 1.9949, "num_input_tokens_seen": 532116232, "step": 522600 }, { "epoch": 10.291193321651473, "grad_norm": 1.9002376794815063, "learning_rate": 2.2407824087348237e-07, "loss": 2.0079, "num_input_tokens_seen": 532218536, "step": 522700 }, { "epoch": 10.293162174400976, "grad_norm": 2.0391688346862793, "learning_rate": 2.239063206013002e-07, "loss": 2.0133, "num_input_tokens_seen": 532319624, "step": 522800 }, { "epoch": 10.295131027150479, "grad_norm": 1.930080533027649, "learning_rate": 2.237344472765196e-07, "loss": 1.9766, "num_input_tokens_seen": 532421472, "step": 522900 }, { "epoch": 10.297099879899982, "grad_norm": 1.7099961042404175, "learning_rate": 2.2356262092836598e-07, "loss": 1.9889, "num_input_tokens_seen": 532523048, "step": 523000 }, { "epoch": 10.299068732649484, "grad_norm": 1.8478100299835205, "learning_rate": 2.2339084158605736e-07, "loss": 2.0042, "num_input_tokens_seen": 532623536, "step": 523100 }, { "epoch": 10.301037585398989, "grad_norm": 1.9965620040893555, "learning_rate": 2.2321910927880273e-07, "loss": 1.9936, "num_input_tokens_seen": 532725936, "step": 523200 }, { "epoch": 10.303006438148492, "grad_norm": 1.9363305568695068, "learning_rate": 2.230474240358044e-07, "loss": 1.9373, "num_input_tokens_seen": 532828336, "step": 523300 }, { "epoch": 10.304975290897994, "grad_norm": 2.0018746852874756, "learning_rate": 2.2287578588625555e-07, "loss": 1.9802, "num_input_tokens_seen": 532930736, "step": 523400 }, { "epoch": 10.306944143647497, "grad_norm": 1.9788929224014282, "learning_rate": 2.227041948593421e-07, "loss": 2.0174, "num_input_tokens_seen": 533033136, "step": 523500 }, { "epoch": 10.308912996397, "grad_norm": 1.749315857887268, "learning_rate": 2.2253265098424111e-07, "loss": 1.9514, "num_input_tokens_seen": 533134480, "step": 523600 }, { "epoch": 10.310881849146503, "grad_norm": 1.8094364404678345, "learning_rate": 2.2236115429012296e-07, "loss": 1.9811, "num_input_tokens_seen": 533236880, "step": 523700 }, { "epoch": 10.312850701896005, "grad_norm": 2.026655912399292, "learning_rate": 2.2218970480614862e-07, "loss": 1.9441, "num_input_tokens_seen": 533339280, "step": 523800 }, { "epoch": 10.314819554645508, "grad_norm": 1.825201153755188, "learning_rate": 2.220183025614718e-07, "loss": 2.0012, "num_input_tokens_seen": 533440728, "step": 523900 }, { "epoch": 10.316788407395011, "grad_norm": 1.8174968957901, "learning_rate": 2.2184694758523802e-07, "loss": 1.9657, "num_input_tokens_seen": 533542312, "step": 524000 }, { "epoch": 10.318757260144514, "grad_norm": 2.0007975101470947, "learning_rate": 2.2167563990658478e-07, "loss": 2.0017, "num_input_tokens_seen": 533644224, "step": 524100 }, { "epoch": 10.320726112894016, "grad_norm": 2.3326287269592285, "learning_rate": 2.2150437955464164e-07, "loss": 1.9784, "num_input_tokens_seen": 533746104, "step": 524200 }, { "epoch": 10.32269496564352, "grad_norm": 1.8556303977966309, "learning_rate": 2.2133316655852962e-07, "loss": 2.026, "num_input_tokens_seen": 533848504, "step": 524300 }, { "epoch": 10.324663818393022, "grad_norm": 1.7767612934112549, "learning_rate": 2.2116200094736226e-07, "loss": 1.9924, "num_input_tokens_seen": 533950312, "step": 524400 }, { "epoch": 10.326632671142525, "grad_norm": 1.9030683040618896, "learning_rate": 2.209908827502448e-07, "loss": 2.0138, "num_input_tokens_seen": 534052712, "step": 524500 }, { "epoch": 10.328601523892027, "grad_norm": 1.857700228691101, "learning_rate": 2.2081981199627464e-07, "loss": 1.9975, "num_input_tokens_seen": 534154120, "step": 524600 }, { "epoch": 10.33057037664153, "grad_norm": 1.9168223142623901, "learning_rate": 2.2064878871454056e-07, "loss": 1.9657, "num_input_tokens_seen": 534256520, "step": 524700 }, { "epoch": 10.332539229391035, "grad_norm": 1.792134165763855, "learning_rate": 2.2047781293412382e-07, "loss": 1.9913, "num_input_tokens_seen": 534358920, "step": 524800 }, { "epoch": 10.334508082140538, "grad_norm": 1.745578646659851, "learning_rate": 2.2030688468409736e-07, "loss": 2.0529, "num_input_tokens_seen": 534458480, "step": 524900 }, { "epoch": 10.33647693489004, "grad_norm": 1.7430298328399658, "learning_rate": 2.2013600399352617e-07, "loss": 1.9256, "num_input_tokens_seen": 534560408, "step": 525000 }, { "epoch": 10.338445787639543, "grad_norm": 2.0049538612365723, "learning_rate": 2.1996517089146716e-07, "loss": 1.9474, "num_input_tokens_seen": 534662808, "step": 525100 }, { "epoch": 10.340414640389046, "grad_norm": 2.332439661026001, "learning_rate": 2.1979438540696872e-07, "loss": 2.0159, "num_input_tokens_seen": 534764512, "step": 525200 }, { "epoch": 10.342383493138549, "grad_norm": 1.9742377996444702, "learning_rate": 2.196236475690717e-07, "loss": 1.9986, "num_input_tokens_seen": 534866272, "step": 525300 }, { "epoch": 10.344352345888051, "grad_norm": 1.7555639743804932, "learning_rate": 2.194529574068086e-07, "loss": 1.9787, "num_input_tokens_seen": 534968584, "step": 525400 }, { "epoch": 10.346321198637554, "grad_norm": 2.2423934936523438, "learning_rate": 2.19282314949204e-07, "loss": 2.0047, "num_input_tokens_seen": 535070224, "step": 525500 }, { "epoch": 10.348290051387057, "grad_norm": 2.0216166973114014, "learning_rate": 2.1911172022527397e-07, "loss": 2.0216, "num_input_tokens_seen": 535170960, "step": 525600 }, { "epoch": 10.35025890413656, "grad_norm": 2.083752155303955, "learning_rate": 2.1894117326402673e-07, "loss": 1.9949, "num_input_tokens_seen": 535273360, "step": 525700 }, { "epoch": 10.352227756886062, "grad_norm": 1.8411338329315186, "learning_rate": 2.1877067409446241e-07, "loss": 1.964, "num_input_tokens_seen": 535374984, "step": 525800 }, { "epoch": 10.354196609635565, "grad_norm": 1.942962884902954, "learning_rate": 2.1860022274557317e-07, "loss": 2.0191, "num_input_tokens_seen": 535477384, "step": 525900 }, { "epoch": 10.356165462385068, "grad_norm": 1.9197605848312378, "learning_rate": 2.1842981924634242e-07, "loss": 1.9617, "num_input_tokens_seen": 535579088, "step": 526000 }, { "epoch": 10.35813431513457, "grad_norm": 1.7973542213439941, "learning_rate": 2.182594636257459e-07, "loss": 1.9685, "num_input_tokens_seen": 535681488, "step": 526100 }, { "epoch": 10.360103167884073, "grad_norm": 1.6107827425003052, "learning_rate": 2.1808915591275155e-07, "loss": 2.0081, "num_input_tokens_seen": 535783448, "step": 526200 }, { "epoch": 10.362072020633576, "grad_norm": 1.6605485677719116, "learning_rate": 2.1791889613631836e-07, "loss": 1.9611, "num_input_tokens_seen": 535885848, "step": 526300 }, { "epoch": 10.364040873383079, "grad_norm": 1.8685499429702759, "learning_rate": 2.1774868432539773e-07, "loss": 1.9851, "num_input_tokens_seen": 535987608, "step": 526400 }, { "epoch": 10.366009726132582, "grad_norm": 1.9300727844238281, "learning_rate": 2.1757852050893265e-07, "loss": 2.0047, "num_input_tokens_seen": 536089304, "step": 526500 }, { "epoch": 10.367978578882086, "grad_norm": 2.131390333175659, "learning_rate": 2.1740840471585837e-07, "loss": 1.9521, "num_input_tokens_seen": 536191704, "step": 526600 }, { "epoch": 10.369947431631589, "grad_norm": 2.055431365966797, "learning_rate": 2.1723833697510113e-07, "loss": 1.9706, "num_input_tokens_seen": 536293504, "step": 526700 }, { "epoch": 10.371916284381092, "grad_norm": 2.128218650817871, "learning_rate": 2.170683173155798e-07, "loss": 1.9381, "num_input_tokens_seen": 536395376, "step": 526800 }, { "epoch": 10.373885137130594, "grad_norm": 1.9211279153823853, "learning_rate": 2.1689834576620474e-07, "loss": 2.0253, "num_input_tokens_seen": 536497776, "step": 526900 }, { "epoch": 10.375853989880097, "grad_norm": 1.6804500818252563, "learning_rate": 2.167284223558782e-07, "loss": 2.0039, "num_input_tokens_seen": 536599184, "step": 527000 }, { "epoch": 10.3778228426296, "grad_norm": 1.6547131538391113, "learning_rate": 2.1655854711349442e-07, "loss": 1.9946, "num_input_tokens_seen": 536701584, "step": 527100 }, { "epoch": 10.379791695379103, "grad_norm": 2.1227502822875977, "learning_rate": 2.163887200679388e-07, "loss": 1.9438, "num_input_tokens_seen": 536803128, "step": 527200 }, { "epoch": 10.381760548128605, "grad_norm": 1.9699610471725464, "learning_rate": 2.1621894124808932e-07, "loss": 1.9879, "num_input_tokens_seen": 536905528, "step": 527300 }, { "epoch": 10.383729400878108, "grad_norm": 1.9580391645431519, "learning_rate": 2.1604921068281528e-07, "loss": 1.9822, "num_input_tokens_seen": 537007928, "step": 527400 }, { "epoch": 10.385698253627611, "grad_norm": 1.9856661558151245, "learning_rate": 2.1587952840097823e-07, "loss": 1.964, "num_input_tokens_seen": 537109896, "step": 527500 }, { "epoch": 10.387667106377114, "grad_norm": 1.7396836280822754, "learning_rate": 2.1570989443143084e-07, "loss": 2.0012, "num_input_tokens_seen": 537211504, "step": 527600 }, { "epoch": 10.389635959126617, "grad_norm": 2.057490587234497, "learning_rate": 2.1554030880301803e-07, "loss": 2.0267, "num_input_tokens_seen": 537313448, "step": 527700 }, { "epoch": 10.39160481187612, "grad_norm": 1.6570557355880737, "learning_rate": 2.1537077154457656e-07, "loss": 2.0146, "num_input_tokens_seen": 537415304, "step": 527800 }, { "epoch": 10.393573664625622, "grad_norm": 2.0204179286956787, "learning_rate": 2.152012826849348e-07, "loss": 1.9446, "num_input_tokens_seen": 537516896, "step": 527900 }, { "epoch": 10.395542517375125, "grad_norm": 1.902204990386963, "learning_rate": 2.150318422529127e-07, "loss": 1.9952, "num_input_tokens_seen": 537618984, "step": 528000 }, { "epoch": 10.397511370124628, "grad_norm": 1.7257249355316162, "learning_rate": 2.1486245027732208e-07, "loss": 1.9506, "num_input_tokens_seen": 537721384, "step": 528100 }, { "epoch": 10.399480222874132, "grad_norm": 1.6369906663894653, "learning_rate": 2.1469310678696718e-07, "loss": 1.9586, "num_input_tokens_seen": 537823104, "step": 528200 }, { "epoch": 10.401449075623635, "grad_norm": 1.8127775192260742, "learning_rate": 2.1452381181064293e-07, "loss": 2.0144, "num_input_tokens_seen": 537925504, "step": 528300 }, { "epoch": 10.403417928373138, "grad_norm": 1.918745756149292, "learning_rate": 2.1435456537713681e-07, "loss": 1.9708, "num_input_tokens_seen": 538027088, "step": 528400 }, { "epoch": 10.40538678112264, "grad_norm": 1.9066112041473389, "learning_rate": 2.1418536751522726e-07, "loss": 1.9414, "num_input_tokens_seen": 538129488, "step": 528500 }, { "epoch": 10.407355633872143, "grad_norm": 1.8209941387176514, "learning_rate": 2.1401621825368565e-07, "loss": 1.9672, "num_input_tokens_seen": 538230440, "step": 528600 }, { "epoch": 10.409324486621646, "grad_norm": 1.7263050079345703, "learning_rate": 2.138471176212739e-07, "loss": 1.9203, "num_input_tokens_seen": 538332840, "step": 528700 }, { "epoch": 10.411293339371149, "grad_norm": 1.8965415954589844, "learning_rate": 2.136780656467464e-07, "loss": 1.9704, "num_input_tokens_seen": 538433896, "step": 528800 }, { "epoch": 10.413262192120651, "grad_norm": 1.8575900793075562, "learning_rate": 2.1350906235884852e-07, "loss": 1.9656, "num_input_tokens_seen": 538534640, "step": 528900 }, { "epoch": 10.415231044870154, "grad_norm": 2.3903491497039795, "learning_rate": 2.133401077863184e-07, "loss": 1.985, "num_input_tokens_seen": 538636176, "step": 529000 }, { "epoch": 10.417199897619657, "grad_norm": 1.6923577785491943, "learning_rate": 2.131712019578853e-07, "loss": 2.019, "num_input_tokens_seen": 538738576, "step": 529100 }, { "epoch": 10.41916875036916, "grad_norm": 4.005575180053711, "learning_rate": 2.1300234490226992e-07, "loss": 2.0357, "num_input_tokens_seen": 538839320, "step": 529200 }, { "epoch": 10.421137603118662, "grad_norm": 2.0925350189208984, "learning_rate": 2.1283353664818516e-07, "loss": 2.0347, "num_input_tokens_seen": 538941720, "step": 529300 }, { "epoch": 10.423106455868165, "grad_norm": 1.9640816450119019, "learning_rate": 2.126647772243354e-07, "loss": 2.0019, "num_input_tokens_seen": 539044120, "step": 529400 }, { "epoch": 10.425075308617668, "grad_norm": 1.8902928829193115, "learning_rate": 2.12496066659417e-07, "loss": 1.9818, "num_input_tokens_seen": 539146520, "step": 529500 }, { "epoch": 10.42704416136717, "grad_norm": 1.8575787544250488, "learning_rate": 2.1232740498211731e-07, "loss": 2.0276, "num_input_tokens_seen": 539248232, "step": 529600 }, { "epoch": 10.429013014116673, "grad_norm": 2.171523094177246, "learning_rate": 2.121587922211161e-07, "loss": 2.0075, "num_input_tokens_seen": 539350632, "step": 529700 }, { "epoch": 10.430981866866176, "grad_norm": 2.09633731842041, "learning_rate": 2.1199022840508452e-07, "loss": 1.9499, "num_input_tokens_seen": 539451976, "step": 529800 }, { "epoch": 10.43295071961568, "grad_norm": 2.0456607341766357, "learning_rate": 2.1182171356268552e-07, "loss": 1.9291, "num_input_tokens_seen": 539553840, "step": 529900 }, { "epoch": 10.434919572365184, "grad_norm": 1.7530344724655151, "learning_rate": 2.116532477225734e-07, "loss": 2.0332, "num_input_tokens_seen": 539655664, "step": 530000 }, { "epoch": 10.436888425114686, "grad_norm": 1.7530919313430786, "learning_rate": 2.1148483091339447e-07, "loss": 2.0249, "num_input_tokens_seen": 539757336, "step": 530100 }, { "epoch": 10.438857277864189, "grad_norm": 1.8370981216430664, "learning_rate": 2.1131646316378655e-07, "loss": 1.9842, "num_input_tokens_seen": 539858816, "step": 530200 }, { "epoch": 10.440826130613692, "grad_norm": 1.7831873893737793, "learning_rate": 2.111481445023793e-07, "loss": 1.9736, "num_input_tokens_seen": 539958856, "step": 530300 }, { "epoch": 10.442794983363195, "grad_norm": 1.8758455514907837, "learning_rate": 2.1097987495779385e-07, "loss": 1.9289, "num_input_tokens_seen": 540060736, "step": 530400 }, { "epoch": 10.444763836112697, "grad_norm": 1.8488799333572388, "learning_rate": 2.108116545586428e-07, "loss": 2.0148, "num_input_tokens_seen": 540162928, "step": 530500 }, { "epoch": 10.4467326888622, "grad_norm": 2.0831823348999023, "learning_rate": 2.1064348333353084e-07, "loss": 1.9506, "num_input_tokens_seen": 540265328, "step": 530600 }, { "epoch": 10.448701541611703, "grad_norm": 1.8798476457595825, "learning_rate": 2.1047536131105392e-07, "loss": 1.9451, "num_input_tokens_seen": 540367728, "step": 530700 }, { "epoch": 10.450670394361206, "grad_norm": 1.7741024494171143, "learning_rate": 2.1030728851980008e-07, "loss": 2.0824, "num_input_tokens_seen": 540469616, "step": 530800 }, { "epoch": 10.452639247110708, "grad_norm": 1.8567248582839966, "learning_rate": 2.1013926498834822e-07, "loss": 1.9891, "num_input_tokens_seen": 540570664, "step": 530900 }, { "epoch": 10.454608099860211, "grad_norm": 1.8395527601242065, "learning_rate": 2.0997129074526936e-07, "loss": 1.9399, "num_input_tokens_seen": 540672496, "step": 531000 }, { "epoch": 10.456576952609714, "grad_norm": 1.7833343744277954, "learning_rate": 2.0980336581912672e-07, "loss": 1.9804, "num_input_tokens_seen": 540774072, "step": 531100 }, { "epoch": 10.458545805359217, "grad_norm": 1.900362253189087, "learning_rate": 2.0963549023847383e-07, "loss": 1.9774, "num_input_tokens_seen": 540876472, "step": 531200 }, { "epoch": 10.46051465810872, "grad_norm": 2.158243179321289, "learning_rate": 2.09467664031857e-07, "loss": 2.0, "num_input_tokens_seen": 540978000, "step": 531300 }, { "epoch": 10.462483510858222, "grad_norm": 1.7789075374603271, "learning_rate": 2.092998872278131e-07, "loss": 1.968, "num_input_tokens_seen": 541080400, "step": 531400 }, { "epoch": 10.464452363607727, "grad_norm": 2.066131830215454, "learning_rate": 2.091321598548718e-07, "loss": 1.9518, "num_input_tokens_seen": 541181352, "step": 531500 }, { "epoch": 10.46642121635723, "grad_norm": 1.716525912284851, "learning_rate": 2.0896448194155332e-07, "loss": 1.97, "num_input_tokens_seen": 541283752, "step": 531600 }, { "epoch": 10.468390069106732, "grad_norm": 1.8623192310333252, "learning_rate": 2.0879685351637011e-07, "loss": 1.9683, "num_input_tokens_seen": 541384760, "step": 531700 }, { "epoch": 10.470358921856235, "grad_norm": 1.951213002204895, "learning_rate": 2.0862927460782547e-07, "loss": 1.9893, "num_input_tokens_seen": 541487160, "step": 531800 }, { "epoch": 10.472327774605738, "grad_norm": 2.0958969593048096, "learning_rate": 2.0846174524441557e-07, "loss": 1.9839, "num_input_tokens_seen": 541588088, "step": 531900 }, { "epoch": 10.47429662735524, "grad_norm": 1.9766016006469727, "learning_rate": 2.082942654546267e-07, "loss": 1.9573, "num_input_tokens_seen": 541690488, "step": 532000 }, { "epoch": 10.476265480104743, "grad_norm": 2.235281229019165, "learning_rate": 2.0812683526693765e-07, "loss": 1.9708, "num_input_tokens_seen": 541792056, "step": 532100 }, { "epoch": 10.478234332854246, "grad_norm": 1.8524541854858398, "learning_rate": 2.0795945470981853e-07, "loss": 2.0323, "num_input_tokens_seen": 541893760, "step": 532200 }, { "epoch": 10.480203185603749, "grad_norm": 1.9473456144332886, "learning_rate": 2.077921238117309e-07, "loss": 2.0381, "num_input_tokens_seen": 541995800, "step": 532300 }, { "epoch": 10.482172038353252, "grad_norm": 1.7756537199020386, "learning_rate": 2.0762484260112824e-07, "loss": 1.9719, "num_input_tokens_seen": 542098200, "step": 532400 }, { "epoch": 10.484140891102754, "grad_norm": 1.9330987930297852, "learning_rate": 2.0745761110645483e-07, "loss": 2.0419, "num_input_tokens_seen": 542199584, "step": 532500 }, { "epoch": 10.486109743852257, "grad_norm": 2.0194287300109863, "learning_rate": 2.0729042935614733e-07, "loss": 2.027, "num_input_tokens_seen": 542301064, "step": 532600 }, { "epoch": 10.48807859660176, "grad_norm": 2.013430118560791, "learning_rate": 2.0712329737863343e-07, "loss": 1.967, "num_input_tokens_seen": 542403464, "step": 532700 }, { "epoch": 10.490047449351263, "grad_norm": 1.8868472576141357, "learning_rate": 2.0695621520233275e-07, "loss": 1.9957, "num_input_tokens_seen": 542505864, "step": 532800 }, { "epoch": 10.492016302100765, "grad_norm": 1.6957166194915771, "learning_rate": 2.0678918285565589e-07, "loss": 2.0094, "num_input_tokens_seen": 542606008, "step": 532900 }, { "epoch": 10.493985154850268, "grad_norm": 3.040231227874756, "learning_rate": 2.0662220036700545e-07, "loss": 1.998, "num_input_tokens_seen": 542707056, "step": 533000 }, { "epoch": 10.49595400759977, "grad_norm": 2.069591999053955, "learning_rate": 2.0645526776477535e-07, "loss": 2.0218, "num_input_tokens_seen": 542808688, "step": 533100 }, { "epoch": 10.497922860349274, "grad_norm": 2.020448923110962, "learning_rate": 2.0628838507735107e-07, "loss": 2.0525, "num_input_tokens_seen": 542910160, "step": 533200 }, { "epoch": 10.499891713098778, "grad_norm": 2.019073724746704, "learning_rate": 2.061215523331098e-07, "loss": 2.0243, "num_input_tokens_seen": 543010928, "step": 533300 }, { "epoch": 10.50186056584828, "grad_norm": 2.085312604904175, "learning_rate": 2.0595476956041974e-07, "loss": 2.0143, "num_input_tokens_seen": 543112216, "step": 533400 }, { "epoch": 10.503829418597784, "grad_norm": 1.8515269756317139, "learning_rate": 2.05788036787641e-07, "loss": 1.985, "num_input_tokens_seen": 543213808, "step": 533500 }, { "epoch": 10.505798271347286, "grad_norm": 1.855665683746338, "learning_rate": 2.056213540431251e-07, "loss": 1.9961, "num_input_tokens_seen": 543315576, "step": 533600 }, { "epoch": 10.50776712409679, "grad_norm": 1.9936466217041016, "learning_rate": 2.0545472135521523e-07, "loss": 2.0058, "num_input_tokens_seen": 543417976, "step": 533700 }, { "epoch": 10.509735976846292, "grad_norm": 1.6375447511672974, "learning_rate": 2.052881387522455e-07, "loss": 2.017, "num_input_tokens_seen": 543519456, "step": 533800 }, { "epoch": 10.511704829595795, "grad_norm": 2.0896337032318115, "learning_rate": 2.0512160626254205e-07, "loss": 1.9796, "num_input_tokens_seen": 543621368, "step": 533900 }, { "epoch": 10.513673682345297, "grad_norm": 1.7808884382247925, "learning_rate": 2.0495512391442238e-07, "loss": 2.0095, "num_input_tokens_seen": 543723768, "step": 534000 }, { "epoch": 10.5156425350948, "grad_norm": 1.9391080141067505, "learning_rate": 2.0478869173619534e-07, "loss": 1.962, "num_input_tokens_seen": 543826168, "step": 534100 }, { "epoch": 10.517611387844303, "grad_norm": 2.5441837310791016, "learning_rate": 2.046223097561615e-07, "loss": 1.9686, "num_input_tokens_seen": 543928016, "step": 534200 }, { "epoch": 10.519580240593806, "grad_norm": 1.7621675729751587, "learning_rate": 2.044559780026122e-07, "loss": 2.0071, "num_input_tokens_seen": 544029240, "step": 534300 }, { "epoch": 10.521549093343308, "grad_norm": 2.0743985176086426, "learning_rate": 2.042896965038315e-07, "loss": 1.9813, "num_input_tokens_seen": 544130888, "step": 534400 }, { "epoch": 10.523517946092811, "grad_norm": 1.8300565481185913, "learning_rate": 2.0412346528809353e-07, "loss": 1.9375, "num_input_tokens_seen": 544233288, "step": 534500 }, { "epoch": 10.525486798842314, "grad_norm": 1.6652699708938599, "learning_rate": 2.0395728438366482e-07, "loss": 2.0245, "num_input_tokens_seen": 544332856, "step": 534600 }, { "epoch": 10.527455651591817, "grad_norm": 1.6961266994476318, "learning_rate": 2.0379115381880295e-07, "loss": 1.9959, "num_input_tokens_seen": 544434272, "step": 534700 }, { "epoch": 10.52942450434132, "grad_norm": 1.836819052696228, "learning_rate": 2.0362507362175718e-07, "loss": 2.0373, "num_input_tokens_seen": 544533528, "step": 534800 }, { "epoch": 10.531393357090824, "grad_norm": 1.9433895349502563, "learning_rate": 2.0345904382076772e-07, "loss": 2.0041, "num_input_tokens_seen": 544635120, "step": 534900 }, { "epoch": 10.533362209840327, "grad_norm": 1.8199739456176758, "learning_rate": 2.032930644440668e-07, "loss": 1.9818, "num_input_tokens_seen": 544737520, "step": 535000 }, { "epoch": 10.53533106258983, "grad_norm": 2.019948959350586, "learning_rate": 2.0312713551987764e-07, "loss": 1.9781, "num_input_tokens_seen": 544838688, "step": 535100 }, { "epoch": 10.537299915339332, "grad_norm": 1.9360240697860718, "learning_rate": 2.0296125707641515e-07, "loss": 2.001, "num_input_tokens_seen": 544941088, "step": 535200 }, { "epoch": 10.539268768088835, "grad_norm": 1.9009687900543213, "learning_rate": 2.0279542914188575e-07, "loss": 1.9559, "num_input_tokens_seen": 545042840, "step": 535300 }, { "epoch": 10.541237620838338, "grad_norm": 1.8150891065597534, "learning_rate": 2.026296517444867e-07, "loss": 2.0156, "num_input_tokens_seen": 545144968, "step": 535400 }, { "epoch": 10.54320647358784, "grad_norm": 2.0322012901306152, "learning_rate": 2.0246392491240726e-07, "loss": 1.9822, "num_input_tokens_seen": 545246592, "step": 535500 }, { "epoch": 10.545175326337343, "grad_norm": 1.6474072933197021, "learning_rate": 2.0229824867382784e-07, "loss": 2.0302, "num_input_tokens_seen": 545348992, "step": 535600 }, { "epoch": 10.547144179086846, "grad_norm": 1.8624540567398071, "learning_rate": 2.0213262305692046e-07, "loss": 1.9629, "num_input_tokens_seen": 545450464, "step": 535700 }, { "epoch": 10.549113031836349, "grad_norm": 1.7304649353027344, "learning_rate": 2.0196704808984805e-07, "loss": 1.9757, "num_input_tokens_seen": 545552864, "step": 535800 }, { "epoch": 10.551081884585852, "grad_norm": 1.6681280136108398, "learning_rate": 2.0180152380076532e-07, "loss": 2.0186, "num_input_tokens_seen": 545654192, "step": 535900 }, { "epoch": 10.553050737335354, "grad_norm": 1.9983810186386108, "learning_rate": 2.0163605021781843e-07, "loss": 2.0142, "num_input_tokens_seen": 545755120, "step": 536000 }, { "epoch": 10.555019590084857, "grad_norm": 2.0335702896118164, "learning_rate": 2.0147062736914468e-07, "loss": 1.9556, "num_input_tokens_seen": 545857024, "step": 536100 }, { "epoch": 10.55698844283436, "grad_norm": 1.5784399509429932, "learning_rate": 2.0130525528287297e-07, "loss": 2.0136, "num_input_tokens_seen": 545959424, "step": 536200 }, { "epoch": 10.558957295583863, "grad_norm": 2.0112950801849365, "learning_rate": 2.0113993398712291e-07, "loss": 1.9556, "num_input_tokens_seen": 546061824, "step": 536300 }, { "epoch": 10.560926148333365, "grad_norm": 8.489437103271484, "learning_rate": 2.0097466351000675e-07, "loss": 1.9847, "num_input_tokens_seen": 546163608, "step": 536400 }, { "epoch": 10.562895001082868, "grad_norm": 1.8609848022460938, "learning_rate": 2.0080944387962683e-07, "loss": 2.0082, "num_input_tokens_seen": 546265304, "step": 536500 }, { "epoch": 10.564863853832373, "grad_norm": 1.7672981023788452, "learning_rate": 2.0064427512407762e-07, "loss": 1.9857, "num_input_tokens_seen": 546366488, "step": 536600 }, { "epoch": 10.566832706581875, "grad_norm": 1.8648737668991089, "learning_rate": 2.0047915727144422e-07, "loss": 1.9564, "num_input_tokens_seen": 546466864, "step": 536700 }, { "epoch": 10.568801559331378, "grad_norm": 2.1185355186462402, "learning_rate": 2.0031409034980418e-07, "loss": 2.0156, "num_input_tokens_seen": 546569264, "step": 536800 }, { "epoch": 10.570770412080881, "grad_norm": 1.8192405700683594, "learning_rate": 2.0014907438722522e-07, "loss": 2.0054, "num_input_tokens_seen": 546671072, "step": 536900 }, { "epoch": 10.572739264830384, "grad_norm": 1.9865933656692505, "learning_rate": 1.999841094117673e-07, "loss": 2.0417, "num_input_tokens_seen": 546773472, "step": 537000 }, { "epoch": 10.574708117579886, "grad_norm": 1.863865613937378, "learning_rate": 1.9981919545148078e-07, "loss": 1.9848, "num_input_tokens_seen": 546875872, "step": 537100 }, { "epoch": 10.57667697032939, "grad_norm": 2.1517233848571777, "learning_rate": 1.996543325344084e-07, "loss": 1.9481, "num_input_tokens_seen": 546978272, "step": 537200 }, { "epoch": 10.578645823078892, "grad_norm": 15.011783599853516, "learning_rate": 1.994895206885837e-07, "loss": 2.0419, "num_input_tokens_seen": 547080048, "step": 537300 }, { "epoch": 10.580614675828395, "grad_norm": 5.901471138000488, "learning_rate": 1.9932475994203123e-07, "loss": 2.0005, "num_input_tokens_seen": 547180208, "step": 537400 }, { "epoch": 10.582583528577898, "grad_norm": 2.052987575531006, "learning_rate": 1.991600503227673e-07, "loss": 2.0015, "num_input_tokens_seen": 547281936, "step": 537500 }, { "epoch": 10.5845523813274, "grad_norm": 1.9954569339752197, "learning_rate": 1.9899539185879943e-07, "loss": 2.0246, "num_input_tokens_seen": 547383752, "step": 537600 }, { "epoch": 10.586521234076903, "grad_norm": 2.0651237964630127, "learning_rate": 1.988307845781265e-07, "loss": 2.0164, "num_input_tokens_seen": 547484800, "step": 537700 }, { "epoch": 10.588490086826406, "grad_norm": 1.91893470287323, "learning_rate": 1.9866622850873838e-07, "loss": 2.0221, "num_input_tokens_seen": 547585760, "step": 537800 }, { "epoch": 10.590458939575909, "grad_norm": 1.7563591003417969, "learning_rate": 1.985017236786165e-07, "loss": 1.9704, "num_input_tokens_seen": 547687624, "step": 537900 }, { "epoch": 10.592427792325411, "grad_norm": 1.728994607925415, "learning_rate": 1.983372701157336e-07, "loss": 2.0119, "num_input_tokens_seen": 547788776, "step": 538000 }, { "epoch": 10.594396645074914, "grad_norm": 1.9012501239776611, "learning_rate": 1.981728678480537e-07, "loss": 1.9796, "num_input_tokens_seen": 547889920, "step": 538100 }, { "epoch": 10.596365497824419, "grad_norm": 1.881460428237915, "learning_rate": 1.9800851690353171e-07, "loss": 1.9646, "num_input_tokens_seen": 547992320, "step": 538200 }, { "epoch": 10.598334350573921, "grad_norm": 2.05285906791687, "learning_rate": 1.9784421731011435e-07, "loss": 2.0002, "num_input_tokens_seen": 548094264, "step": 538300 }, { "epoch": 10.600303203323424, "grad_norm": 1.824878215789795, "learning_rate": 1.9767996909573931e-07, "loss": 1.9559, "num_input_tokens_seen": 548196064, "step": 538400 }, { "epoch": 10.602272056072927, "grad_norm": 1.8374119997024536, "learning_rate": 1.9751577228833565e-07, "loss": 2.0164, "num_input_tokens_seen": 548297648, "step": 538500 }, { "epoch": 10.60424090882243, "grad_norm": 21.481945037841797, "learning_rate": 1.973516269158238e-07, "loss": 2.036, "num_input_tokens_seen": 548399568, "step": 538600 }, { "epoch": 10.606209761571932, "grad_norm": 1.8211166858673096, "learning_rate": 1.9718753300611495e-07, "loss": 1.9437, "num_input_tokens_seen": 548501968, "step": 538700 }, { "epoch": 10.608178614321435, "grad_norm": 2.0096049308776855, "learning_rate": 1.9702349058711216e-07, "loss": 1.9995, "num_input_tokens_seen": 548603816, "step": 538800 }, { "epoch": 10.610147467070938, "grad_norm": 2.055584192276001, "learning_rate": 1.968594996867093e-07, "loss": 1.9478, "num_input_tokens_seen": 548705456, "step": 538900 }, { "epoch": 10.61211631982044, "grad_norm": 1.9225744009017944, "learning_rate": 1.9669556033279188e-07, "loss": 2.0025, "num_input_tokens_seen": 548807856, "step": 539000 }, { "epoch": 10.614085172569943, "grad_norm": 1.860952615737915, "learning_rate": 1.965316725532361e-07, "loss": 2.0287, "num_input_tokens_seen": 548910256, "step": 539100 }, { "epoch": 10.616054025319446, "grad_norm": 1.8782844543457031, "learning_rate": 1.9636783637590964e-07, "loss": 1.9812, "num_input_tokens_seen": 549011888, "step": 539200 }, { "epoch": 10.618022878068949, "grad_norm": 2.0483462810516357, "learning_rate": 1.96204051828672e-07, "loss": 1.9846, "num_input_tokens_seen": 549113040, "step": 539300 }, { "epoch": 10.619991730818452, "grad_norm": 2.2186572551727295, "learning_rate": 1.9604031893937284e-07, "loss": 1.9836, "num_input_tokens_seen": 549215440, "step": 539400 }, { "epoch": 10.621960583567954, "grad_norm": 1.8505703210830688, "learning_rate": 1.9587663773585395e-07, "loss": 1.9707, "num_input_tokens_seen": 549315832, "step": 539500 }, { "epoch": 10.623929436317457, "grad_norm": 1.854048728942871, "learning_rate": 1.9571300824594727e-07, "loss": 1.9586, "num_input_tokens_seen": 549418232, "step": 539600 }, { "epoch": 10.62589828906696, "grad_norm": 4.737663745880127, "learning_rate": 1.955494304974775e-07, "loss": 2.0373, "num_input_tokens_seen": 549520480, "step": 539700 }, { "epoch": 10.627867141816463, "grad_norm": 1.8604631423950195, "learning_rate": 1.9538590451825898e-07, "loss": 2.0142, "num_input_tokens_seen": 549622152, "step": 539800 }, { "epoch": 10.629835994565966, "grad_norm": 1.6579934358596802, "learning_rate": 1.952224303360984e-07, "loss": 1.9897, "num_input_tokens_seen": 549724552, "step": 539900 }, { "epoch": 10.63180484731547, "grad_norm": 2.0413172245025635, "learning_rate": 1.9505900797879255e-07, "loss": 1.998, "num_input_tokens_seen": 549825912, "step": 540000 }, { "epoch": 10.633773700064973, "grad_norm": 1.9077850580215454, "learning_rate": 1.9489563747413075e-07, "loss": 1.9597, "num_input_tokens_seen": 549928312, "step": 540100 }, { "epoch": 10.635742552814476, "grad_norm": 1.9252736568450928, "learning_rate": 1.9473231884989228e-07, "loss": 2.0268, "num_input_tokens_seen": 550030712, "step": 540200 }, { "epoch": 10.637711405563978, "grad_norm": 1.7537686824798584, "learning_rate": 1.9456905213384818e-07, "loss": 1.9472, "num_input_tokens_seen": 550133112, "step": 540300 }, { "epoch": 10.639680258313481, "grad_norm": 1.9000575542449951, "learning_rate": 1.944058373537607e-07, "loss": 1.9933, "num_input_tokens_seen": 550233992, "step": 540400 }, { "epoch": 10.641649111062984, "grad_norm": 2.1509549617767334, "learning_rate": 1.9424267453738307e-07, "loss": 2.0051, "num_input_tokens_seen": 550336392, "step": 540500 }, { "epoch": 10.643617963812487, "grad_norm": 1.662351369857788, "learning_rate": 1.9407956371245993e-07, "loss": 1.9517, "num_input_tokens_seen": 550438792, "step": 540600 }, { "epoch": 10.64558681656199, "grad_norm": 2.894723892211914, "learning_rate": 1.939165049067265e-07, "loss": 1.9642, "num_input_tokens_seen": 550540632, "step": 540700 }, { "epoch": 10.647555669311492, "grad_norm": 1.9298900365829468, "learning_rate": 1.9375349814790985e-07, "loss": 1.9798, "num_input_tokens_seen": 550643032, "step": 540800 }, { "epoch": 10.649524522060995, "grad_norm": 2.13678240776062, "learning_rate": 1.9359054346372784e-07, "loss": 2.0034, "num_input_tokens_seen": 550744768, "step": 540900 }, { "epoch": 10.651493374810498, "grad_norm": 1.8163970708847046, "learning_rate": 1.9342764088188967e-07, "loss": 1.9512, "num_input_tokens_seen": 550847168, "step": 541000 }, { "epoch": 10.65346222756, "grad_norm": 1.9950670003890991, "learning_rate": 1.9326479043009526e-07, "loss": 1.9809, "num_input_tokens_seen": 550949000, "step": 541100 }, { "epoch": 10.655431080309503, "grad_norm": 1.9394340515136719, "learning_rate": 1.9310199213603611e-07, "loss": 1.9153, "num_input_tokens_seen": 551051400, "step": 541200 }, { "epoch": 10.657399933059006, "grad_norm": 2.0396976470947266, "learning_rate": 1.929392460273947e-07, "loss": 1.9686, "num_input_tokens_seen": 551152200, "step": 541300 }, { "epoch": 10.659368785808509, "grad_norm": 1.7576956748962402, "learning_rate": 1.9277655213184462e-07, "loss": 1.9648, "num_input_tokens_seen": 551254600, "step": 541400 }, { "epoch": 10.661337638558011, "grad_norm": 2.1296796798706055, "learning_rate": 1.9261391047705083e-07, "loss": 1.9509, "num_input_tokens_seen": 551356448, "step": 541500 }, { "epoch": 10.663306491307516, "grad_norm": 1.871614933013916, "learning_rate": 1.9245132109066874e-07, "loss": 1.9598, "num_input_tokens_seen": 551458264, "step": 541600 }, { "epoch": 10.665275344057019, "grad_norm": 2.2402889728546143, "learning_rate": 1.9228878400034548e-07, "loss": 1.9813, "num_input_tokens_seen": 551560664, "step": 541700 }, { "epoch": 10.667244196806521, "grad_norm": 2.0696139335632324, "learning_rate": 1.921262992337192e-07, "loss": 1.9625, "num_input_tokens_seen": 551662648, "step": 541800 }, { "epoch": 10.669213049556024, "grad_norm": 1.936038613319397, "learning_rate": 1.9196386681841914e-07, "loss": 1.9948, "num_input_tokens_seen": 551763808, "step": 541900 }, { "epoch": 10.671181902305527, "grad_norm": 2.043972969055176, "learning_rate": 1.918014867820653e-07, "loss": 1.975, "num_input_tokens_seen": 551866208, "step": 542000 }, { "epoch": 10.67315075505503, "grad_norm": 2.106046676635742, "learning_rate": 1.9163915915226914e-07, "loss": 1.9942, "num_input_tokens_seen": 551968296, "step": 542100 }, { "epoch": 10.675119607804533, "grad_norm": 2.1447432041168213, "learning_rate": 1.9147688395663315e-07, "loss": 1.9886, "num_input_tokens_seen": 552069288, "step": 542200 }, { "epoch": 10.677088460554035, "grad_norm": 2.026850700378418, "learning_rate": 1.9131466122275085e-07, "loss": 1.9473, "num_input_tokens_seen": 552171088, "step": 542300 }, { "epoch": 10.679057313303538, "grad_norm": 2.0418670177459717, "learning_rate": 1.9115249097820695e-07, "loss": 1.9987, "num_input_tokens_seen": 552272080, "step": 542400 }, { "epoch": 10.68102616605304, "grad_norm": 1.7151198387145996, "learning_rate": 1.9099037325057672e-07, "loss": 1.9792, "num_input_tokens_seen": 552373664, "step": 542500 }, { "epoch": 10.682995018802544, "grad_norm": 2.2091479301452637, "learning_rate": 1.9082830806742755e-07, "loss": 2.0089, "num_input_tokens_seen": 552476064, "step": 542600 }, { "epoch": 10.684963871552046, "grad_norm": 2.1580698490142822, "learning_rate": 1.9066629545631675e-07, "loss": 1.971, "num_input_tokens_seen": 552577944, "step": 542700 }, { "epoch": 10.686932724301549, "grad_norm": 1.9133449792861938, "learning_rate": 1.9050433544479338e-07, "loss": 1.9845, "num_input_tokens_seen": 552680344, "step": 542800 }, { "epoch": 10.688901577051052, "grad_norm": 1.8320341110229492, "learning_rate": 1.903424280603973e-07, "loss": 1.9252, "num_input_tokens_seen": 552782264, "step": 542900 }, { "epoch": 10.690870429800555, "grad_norm": 2.1758272647857666, "learning_rate": 1.9018057333065979e-07, "loss": 2.0062, "num_input_tokens_seen": 552883224, "step": 543000 }, { "epoch": 10.692839282550057, "grad_norm": 1.948440670967102, "learning_rate": 1.9001877128310246e-07, "loss": 1.9744, "num_input_tokens_seen": 552984800, "step": 543100 }, { "epoch": 10.69480813529956, "grad_norm": 1.7925294637680054, "learning_rate": 1.8985702194523856e-07, "loss": 1.9985, "num_input_tokens_seen": 553087200, "step": 543200 }, { "epoch": 10.696776988049065, "grad_norm": 1.9080694913864136, "learning_rate": 1.8969532534457223e-07, "loss": 2.0328, "num_input_tokens_seen": 553188536, "step": 543300 }, { "epoch": 10.698745840798567, "grad_norm": 1.8982210159301758, "learning_rate": 1.8953368150859854e-07, "loss": 1.9933, "num_input_tokens_seen": 553289712, "step": 543400 }, { "epoch": 10.70071469354807, "grad_norm": 1.95067298412323, "learning_rate": 1.8937209046480395e-07, "loss": 1.9887, "num_input_tokens_seen": 553391664, "step": 543500 }, { "epoch": 10.702683546297573, "grad_norm": 1.7763067483901978, "learning_rate": 1.8921055224066522e-07, "loss": 1.9374, "num_input_tokens_seen": 553494064, "step": 543600 }, { "epoch": 10.704652399047076, "grad_norm": 2.1477227210998535, "learning_rate": 1.890490668636508e-07, "loss": 1.9607, "num_input_tokens_seen": 553596464, "step": 543700 }, { "epoch": 10.706621251796578, "grad_norm": 1.7865806818008423, "learning_rate": 1.8888763436121986e-07, "loss": 1.9777, "num_input_tokens_seen": 553697952, "step": 543800 }, { "epoch": 10.708590104546081, "grad_norm": 1.9819812774658203, "learning_rate": 1.8872625476082277e-07, "loss": 1.9756, "num_input_tokens_seen": 553800352, "step": 543900 }, { "epoch": 10.710558957295584, "grad_norm": 2.0499982833862305, "learning_rate": 1.8856492808990055e-07, "loss": 1.9781, "num_input_tokens_seen": 553901904, "step": 544000 }, { "epoch": 10.712527810045087, "grad_norm": 1.919305443763733, "learning_rate": 1.8840365437588552e-07, "loss": 2.0011, "num_input_tokens_seen": 554004304, "step": 544100 }, { "epoch": 10.71449666279459, "grad_norm": 1.708212971687317, "learning_rate": 1.8824243364620091e-07, "loss": 1.9838, "num_input_tokens_seen": 554106208, "step": 544200 }, { "epoch": 10.716465515544092, "grad_norm": 1.9649652242660522, "learning_rate": 1.88081265928261e-07, "loss": 1.985, "num_input_tokens_seen": 554208608, "step": 544300 }, { "epoch": 10.718434368293595, "grad_norm": 2.2554948329925537, "learning_rate": 1.8792015124947118e-07, "loss": 1.9327, "num_input_tokens_seen": 554311008, "step": 544400 }, { "epoch": 10.720403221043098, "grad_norm": 1.8740923404693604, "learning_rate": 1.8775908963722703e-07, "loss": 1.9693, "num_input_tokens_seen": 554412944, "step": 544500 }, { "epoch": 10.7223720737926, "grad_norm": 1.6839097738265991, "learning_rate": 1.875980811189165e-07, "loss": 1.9832, "num_input_tokens_seen": 554515344, "step": 544600 }, { "epoch": 10.724340926542103, "grad_norm": 2.1475319862365723, "learning_rate": 1.8743712572191722e-07, "loss": 1.924, "num_input_tokens_seen": 554617744, "step": 544700 }, { "epoch": 10.726309779291606, "grad_norm": 1.9010370969772339, "learning_rate": 1.8727622347359857e-07, "loss": 1.9734, "num_input_tokens_seen": 554718976, "step": 544800 }, { "epoch": 10.72827863204111, "grad_norm": 2.0754222869873047, "learning_rate": 1.8711537440132013e-07, "loss": 1.9793, "num_input_tokens_seen": 554821376, "step": 544900 }, { "epoch": 10.730247484790613, "grad_norm": 2.4671974182128906, "learning_rate": 1.8695457853243363e-07, "loss": 1.9605, "num_input_tokens_seen": 554923200, "step": 545000 }, { "epoch": 10.732216337540116, "grad_norm": 2.042914628982544, "learning_rate": 1.8679383589428056e-07, "loss": 1.9705, "num_input_tokens_seen": 555025600, "step": 545100 }, { "epoch": 10.734185190289619, "grad_norm": 1.9371339082717896, "learning_rate": 1.8663314651419416e-07, "loss": 2.0378, "num_input_tokens_seen": 555127408, "step": 545200 }, { "epoch": 10.736154043039122, "grad_norm": 2.2449545860290527, "learning_rate": 1.8647251041949775e-07, "loss": 1.996, "num_input_tokens_seen": 555228912, "step": 545300 }, { "epoch": 10.738122895788624, "grad_norm": 2.1624972820281982, "learning_rate": 1.8631192763750676e-07, "loss": 1.9941, "num_input_tokens_seen": 555331312, "step": 545400 }, { "epoch": 10.740091748538127, "grad_norm": 2.1800129413604736, "learning_rate": 1.8615139819552683e-07, "loss": 1.9384, "num_input_tokens_seen": 555432840, "step": 545500 }, { "epoch": 10.74206060128763, "grad_norm": 1.8301419019699097, "learning_rate": 1.8599092212085437e-07, "loss": 1.9991, "num_input_tokens_seen": 555535240, "step": 545600 }, { "epoch": 10.744029454037133, "grad_norm": 1.8693466186523438, "learning_rate": 1.858304994407771e-07, "loss": 1.996, "num_input_tokens_seen": 555636504, "step": 545700 }, { "epoch": 10.745998306786635, "grad_norm": 2.3593571186065674, "learning_rate": 1.856701301825736e-07, "loss": 2.0408, "num_input_tokens_seen": 555738904, "step": 545800 }, { "epoch": 10.747967159536138, "grad_norm": 1.9689140319824219, "learning_rate": 1.8550981437351343e-07, "loss": 1.9639, "num_input_tokens_seen": 555840704, "step": 545900 }, { "epoch": 10.74993601228564, "grad_norm": 2.083340883255005, "learning_rate": 1.8534955204085668e-07, "loss": 1.9826, "num_input_tokens_seen": 555942048, "step": 546000 }, { "epoch": 10.751904865035144, "grad_norm": 2.0336508750915527, "learning_rate": 1.8518934321185464e-07, "loss": 1.9606, "num_input_tokens_seen": 556043680, "step": 546100 }, { "epoch": 10.753873717784646, "grad_norm": 3.2070682048797607, "learning_rate": 1.850291879137496e-07, "loss": 1.9645, "num_input_tokens_seen": 556146080, "step": 546200 }, { "epoch": 10.75584257053415, "grad_norm": 1.8451169729232788, "learning_rate": 1.8486908617377456e-07, "loss": 1.9802, "num_input_tokens_seen": 556247888, "step": 546300 }, { "epoch": 10.757811423283652, "grad_norm": 1.7789429426193237, "learning_rate": 1.8470903801915366e-07, "loss": 1.9566, "num_input_tokens_seen": 556349440, "step": 546400 }, { "epoch": 10.759780276033155, "grad_norm": 1.8773837089538574, "learning_rate": 1.8454904347710138e-07, "loss": 1.9755, "num_input_tokens_seen": 556451840, "step": 546500 }, { "epoch": 10.761749128782657, "grad_norm": 1.8822336196899414, "learning_rate": 1.843891025748236e-07, "loss": 1.9436, "num_input_tokens_seen": 556554240, "step": 546600 }, { "epoch": 10.763717981532162, "grad_norm": 1.7115010023117065, "learning_rate": 1.8422921533951697e-07, "loss": 2.0207, "num_input_tokens_seen": 556655296, "step": 546700 }, { "epoch": 10.765686834281665, "grad_norm": 2.0768983364105225, "learning_rate": 1.8406938179836912e-07, "loss": 1.992, "num_input_tokens_seen": 556757696, "step": 546800 }, { "epoch": 10.767655687031167, "grad_norm": 1.9417039155960083, "learning_rate": 1.83909601978558e-07, "loss": 1.9803, "num_input_tokens_seen": 556859232, "step": 546900 }, { "epoch": 10.76962453978067, "grad_norm": 1.8766381740570068, "learning_rate": 1.8374987590725306e-07, "loss": 1.9494, "num_input_tokens_seen": 556961136, "step": 547000 }, { "epoch": 10.771593392530173, "grad_norm": 2.033201217651367, "learning_rate": 1.8359020361161437e-07, "loss": 1.9882, "num_input_tokens_seen": 557063536, "step": 547100 }, { "epoch": 10.773562245279676, "grad_norm": 2.4288578033447266, "learning_rate": 1.8343058511879296e-07, "loss": 2.014, "num_input_tokens_seen": 557164472, "step": 547200 }, { "epoch": 10.775531098029179, "grad_norm": 1.9378972053527832, "learning_rate": 1.832710204559303e-07, "loss": 1.9844, "num_input_tokens_seen": 557266872, "step": 547300 }, { "epoch": 10.777499950778681, "grad_norm": 1.8342008590698242, "learning_rate": 1.831115096501591e-07, "loss": 1.9789, "num_input_tokens_seen": 557367608, "step": 547400 }, { "epoch": 10.779468803528184, "grad_norm": 1.6611577272415161, "learning_rate": 1.829520527286032e-07, "loss": 2.0199, "num_input_tokens_seen": 557470008, "step": 547500 }, { "epoch": 10.781437656277687, "grad_norm": 1.9544681310653687, "learning_rate": 1.8279264971837644e-07, "loss": 2.0638, "num_input_tokens_seen": 557571088, "step": 547600 }, { "epoch": 10.78340650902719, "grad_norm": 1.902199625968933, "learning_rate": 1.8263330064658434e-07, "loss": 1.9636, "num_input_tokens_seen": 557673488, "step": 547700 }, { "epoch": 10.785375361776692, "grad_norm": 2.241950750350952, "learning_rate": 1.8247400554032233e-07, "loss": 1.9807, "num_input_tokens_seen": 557775312, "step": 547800 }, { "epoch": 10.787344214526195, "grad_norm": 1.654066801071167, "learning_rate": 1.8231476442667794e-07, "loss": 1.9661, "num_input_tokens_seen": 557876896, "step": 547900 }, { "epoch": 10.789313067275698, "grad_norm": 1.7208657264709473, "learning_rate": 1.8215557733272813e-07, "loss": 1.996, "num_input_tokens_seen": 557979296, "step": 548000 }, { "epoch": 10.7912819200252, "grad_norm": 1.879143476486206, "learning_rate": 1.8199644428554177e-07, "loss": 1.9259, "num_input_tokens_seen": 558081696, "step": 548100 }, { "epoch": 10.793250772774703, "grad_norm": 1.8063020706176758, "learning_rate": 1.8183736531217753e-07, "loss": 1.9632, "num_input_tokens_seen": 558184096, "step": 548200 }, { "epoch": 10.795219625524208, "grad_norm": 1.8183461427688599, "learning_rate": 1.8167834043968616e-07, "loss": 1.9789, "num_input_tokens_seen": 558285808, "step": 548300 }, { "epoch": 10.79718847827371, "grad_norm": 2.0113089084625244, "learning_rate": 1.81519369695108e-07, "loss": 1.9752, "num_input_tokens_seen": 558388136, "step": 548400 }, { "epoch": 10.799157331023213, "grad_norm": 1.9190913438796997, "learning_rate": 1.8136045310547483e-07, "loss": 1.9626, "num_input_tokens_seen": 558490088, "step": 548500 }, { "epoch": 10.801126183772716, "grad_norm": 2.0270869731903076, "learning_rate": 1.812015906978091e-07, "loss": 2.0121, "num_input_tokens_seen": 558592488, "step": 548600 }, { "epoch": 10.803095036522219, "grad_norm": 1.844152808189392, "learning_rate": 1.8104278249912398e-07, "loss": 1.9753, "num_input_tokens_seen": 558694888, "step": 548700 }, { "epoch": 10.805063889271722, "grad_norm": 1.7602601051330566, "learning_rate": 1.8088402853642365e-07, "loss": 1.9593, "num_input_tokens_seen": 558796840, "step": 548800 }, { "epoch": 10.807032742021224, "grad_norm": 2.2236902713775635, "learning_rate": 1.8072532883670256e-07, "loss": 1.9732, "num_input_tokens_seen": 558898864, "step": 548900 }, { "epoch": 10.809001594770727, "grad_norm": 2.0315370559692383, "learning_rate": 1.8056668342694637e-07, "loss": 1.9944, "num_input_tokens_seen": 558999960, "step": 549000 }, { "epoch": 10.81097044752023, "grad_norm": 1.8718507289886475, "learning_rate": 1.8040809233413147e-07, "loss": 1.9797, "num_input_tokens_seen": 559100584, "step": 549100 }, { "epoch": 10.812939300269733, "grad_norm": 2.095684289932251, "learning_rate": 1.8024955558522509e-07, "loss": 2.0122, "num_input_tokens_seen": 559202984, "step": 549200 }, { "epoch": 10.814908153019235, "grad_norm": 4.291043281555176, "learning_rate": 1.800910732071847e-07, "loss": 1.9312, "num_input_tokens_seen": 559304560, "step": 549300 }, { "epoch": 10.816877005768738, "grad_norm": 2.2442626953125, "learning_rate": 1.7993264522695905e-07, "loss": 1.986, "num_input_tokens_seen": 559405872, "step": 549400 }, { "epoch": 10.818845858518241, "grad_norm": 1.9597442150115967, "learning_rate": 1.7977427167148758e-07, "loss": 2.0036, "num_input_tokens_seen": 559508272, "step": 549500 }, { "epoch": 10.820814711267744, "grad_norm": 1.7878895998001099, "learning_rate": 1.7961595256770024e-07, "loss": 1.9619, "num_input_tokens_seen": 559610112, "step": 549600 }, { "epoch": 10.822783564017247, "grad_norm": 2.0321359634399414, "learning_rate": 1.794576879425181e-07, "loss": 2.0236, "num_input_tokens_seen": 559712512, "step": 549700 }, { "epoch": 10.82475241676675, "grad_norm": 1.8442186117172241, "learning_rate": 1.7929947782285239e-07, "loss": 1.9962, "num_input_tokens_seen": 559814912, "step": 549800 }, { "epoch": 10.826721269516252, "grad_norm": 2.162869691848755, "learning_rate": 1.7914132223560558e-07, "loss": 2.0282, "num_input_tokens_seen": 559917168, "step": 549900 }, { "epoch": 10.828690122265757, "grad_norm": 3.412938356399536, "learning_rate": 1.7898322120767063e-07, "loss": 2.0156, "num_input_tokens_seen": 560018000, "step": 550000 }, { "epoch": 10.83065897501526, "grad_norm": 1.93895423412323, "learning_rate": 1.788251747659315e-07, "loss": 2.0037, "num_input_tokens_seen": 560119568, "step": 550100 }, { "epoch": 10.832627827764762, "grad_norm": 2.056131601333618, "learning_rate": 1.7866718293726236e-07, "loss": 1.9807, "num_input_tokens_seen": 560221512, "step": 550200 }, { "epoch": 10.834596680514265, "grad_norm": 2.139589786529541, "learning_rate": 1.7850924574852854e-07, "loss": 2.0092, "num_input_tokens_seen": 560320424, "step": 550300 }, { "epoch": 10.836565533263768, "grad_norm": 2.064422369003296, "learning_rate": 1.7835136322658584e-07, "loss": 1.939, "num_input_tokens_seen": 560422480, "step": 550400 }, { "epoch": 10.83853438601327, "grad_norm": 1.7997618913650513, "learning_rate": 1.7819353539828097e-07, "loss": 1.9604, "num_input_tokens_seen": 560524240, "step": 550500 }, { "epoch": 10.840503238762773, "grad_norm": 1.8308550119400024, "learning_rate": 1.7803576229045125e-07, "loss": 1.9907, "num_input_tokens_seen": 560625744, "step": 550600 }, { "epoch": 10.842472091512276, "grad_norm": 1.782130241394043, "learning_rate": 1.7787804392992433e-07, "loss": 2.0138, "num_input_tokens_seen": 560727272, "step": 550700 }, { "epoch": 10.844440944261779, "grad_norm": 1.8118078708648682, "learning_rate": 1.7772038034351943e-07, "loss": 1.9414, "num_input_tokens_seen": 560829672, "step": 550800 }, { "epoch": 10.846409797011281, "grad_norm": 1.910395622253418, "learning_rate": 1.7756277155804545e-07, "loss": 1.9839, "num_input_tokens_seen": 560931520, "step": 550900 }, { "epoch": 10.848378649760784, "grad_norm": 1.6024221181869507, "learning_rate": 1.7740521760030264e-07, "loss": 1.9676, "num_input_tokens_seen": 561033416, "step": 551000 }, { "epoch": 10.850347502510287, "grad_norm": 1.9158639907836914, "learning_rate": 1.7724771849708164e-07, "loss": 1.9416, "num_input_tokens_seen": 561135816, "step": 551100 }, { "epoch": 10.85231635525979, "grad_norm": 2.0344066619873047, "learning_rate": 1.770902742751641e-07, "loss": 1.9874, "num_input_tokens_seen": 561237280, "step": 551200 }, { "epoch": 10.854285208009292, "grad_norm": 1.8660359382629395, "learning_rate": 1.769328849613217e-07, "loss": 2.0408, "num_input_tokens_seen": 561338968, "step": 551300 }, { "epoch": 10.856254060758795, "grad_norm": 2.2782697677612305, "learning_rate": 1.7677555058231731e-07, "loss": 1.9723, "num_input_tokens_seen": 561441368, "step": 551400 }, { "epoch": 10.858222913508298, "grad_norm": 1.8551559448242188, "learning_rate": 1.7661827116490435e-07, "loss": 1.9687, "num_input_tokens_seen": 561542992, "step": 551500 }, { "epoch": 10.860191766257802, "grad_norm": 1.6949559450149536, "learning_rate": 1.7646104673582685e-07, "loss": 1.9687, "num_input_tokens_seen": 561645392, "step": 551600 }, { "epoch": 10.862160619007305, "grad_norm": 1.8333609104156494, "learning_rate": 1.7630387732181973e-07, "loss": 1.9726, "num_input_tokens_seen": 561747792, "step": 551700 }, { "epoch": 10.864129471756808, "grad_norm": 1.9584308862686157, "learning_rate": 1.7614676294960785e-07, "loss": 1.9377, "num_input_tokens_seen": 561849584, "step": 551800 }, { "epoch": 10.86609832450631, "grad_norm": 1.767691731452942, "learning_rate": 1.7598970364590742e-07, "loss": 2.0214, "num_input_tokens_seen": 561951112, "step": 551900 }, { "epoch": 10.868067177255814, "grad_norm": 27.583389282226562, "learning_rate": 1.7583269943742513e-07, "loss": 2.0145, "num_input_tokens_seen": 562052280, "step": 552000 }, { "epoch": 10.870036030005316, "grad_norm": 1.7727301120758057, "learning_rate": 1.7567575035085824e-07, "loss": 2.0146, "num_input_tokens_seen": 562154680, "step": 552100 }, { "epoch": 10.872004882754819, "grad_norm": 1.8757035732269287, "learning_rate": 1.7551885641289437e-07, "loss": 1.9759, "num_input_tokens_seen": 562257080, "step": 552200 }, { "epoch": 10.873973735504322, "grad_norm": 1.8349326848983765, "learning_rate": 1.753620176502122e-07, "loss": 2.0018, "num_input_tokens_seen": 562359024, "step": 552300 }, { "epoch": 10.875942588253825, "grad_norm": 1.9872115850448608, "learning_rate": 1.752052340894808e-07, "loss": 2.0174, "num_input_tokens_seen": 562460808, "step": 552400 }, { "epoch": 10.877911441003327, "grad_norm": 1.9421464204788208, "learning_rate": 1.7504850575735984e-07, "loss": 1.9412, "num_input_tokens_seen": 562562624, "step": 552500 }, { "epoch": 10.87988029375283, "grad_norm": 1.8458161354064941, "learning_rate": 1.7489183268049994e-07, "loss": 2.0346, "num_input_tokens_seen": 562665024, "step": 552600 }, { "epoch": 10.881849146502333, "grad_norm": 2.0116019248962402, "learning_rate": 1.747352148855415e-07, "loss": 1.9846, "num_input_tokens_seen": 562767424, "step": 552700 }, { "epoch": 10.883817999251836, "grad_norm": 1.963964819908142, "learning_rate": 1.7457865239911663e-07, "loss": 1.9838, "num_input_tokens_seen": 562869016, "step": 552800 }, { "epoch": 10.885786852001338, "grad_norm": 35.831180572509766, "learning_rate": 1.7442214524784705e-07, "loss": 2.006, "num_input_tokens_seen": 562971416, "step": 552900 }, { "epoch": 10.887755704750841, "grad_norm": 2.1121113300323486, "learning_rate": 1.7426569345834573e-07, "loss": 1.9823, "num_input_tokens_seen": 563073816, "step": 553000 }, { "epoch": 10.889724557500344, "grad_norm": 21.630117416381836, "learning_rate": 1.741092970572156e-07, "loss": 2.0341, "num_input_tokens_seen": 563175296, "step": 553100 }, { "epoch": 10.891693410249847, "grad_norm": 1.8024646043777466, "learning_rate": 1.7395295607105116e-07, "loss": 2.0162, "num_input_tokens_seen": 563277328, "step": 553200 }, { "epoch": 10.89366226299935, "grad_norm": 2.0604710578918457, "learning_rate": 1.7379667052643632e-07, "loss": 1.9976, "num_input_tokens_seen": 563378568, "step": 553300 }, { "epoch": 10.895631115748854, "grad_norm": 2.136566638946533, "learning_rate": 1.736404404499463e-07, "loss": 1.9893, "num_input_tokens_seen": 563480968, "step": 553400 }, { "epoch": 10.897599968498357, "grad_norm": 1.924424171447754, "learning_rate": 1.734842658681468e-07, "loss": 2.036, "num_input_tokens_seen": 563581760, "step": 553500 }, { "epoch": 10.89956882124786, "grad_norm": 9.791131019592285, "learning_rate": 1.7332814680759396e-07, "loss": 1.9807, "num_input_tokens_seen": 563683200, "step": 553600 }, { "epoch": 10.901537673997362, "grad_norm": 1.996576189994812, "learning_rate": 1.7317208329483456e-07, "loss": 1.95, "num_input_tokens_seen": 563784976, "step": 553700 }, { "epoch": 10.903506526746865, "grad_norm": 1.685051679611206, "learning_rate": 1.7301607535640566e-07, "loss": 1.9791, "num_input_tokens_seen": 563886624, "step": 553800 }, { "epoch": 10.905475379496368, "grad_norm": 2.1437501907348633, "learning_rate": 1.7286012301883523e-07, "loss": 1.9887, "num_input_tokens_seen": 563989024, "step": 553900 }, { "epoch": 10.90744423224587, "grad_norm": 2.030673027038574, "learning_rate": 1.727042263086417e-07, "loss": 2.0081, "num_input_tokens_seen": 564089872, "step": 554000 }, { "epoch": 10.909413084995373, "grad_norm": 1.8840281963348389, "learning_rate": 1.7254838525233406e-07, "loss": 1.9486, "num_input_tokens_seen": 564192272, "step": 554100 }, { "epoch": 10.911381937744876, "grad_norm": 15.175052642822266, "learning_rate": 1.7239259987641152e-07, "loss": 2.0381, "num_input_tokens_seen": 564293720, "step": 554200 }, { "epoch": 10.913350790494379, "grad_norm": 1.842354416847229, "learning_rate": 1.7223687020736421e-07, "loss": 1.9525, "num_input_tokens_seen": 564396120, "step": 554300 }, { "epoch": 10.915319643243881, "grad_norm": 1.9844493865966797, "learning_rate": 1.7208119627167262e-07, "loss": 1.9901, "num_input_tokens_seen": 564497872, "step": 554400 }, { "epoch": 10.917288495993384, "grad_norm": 2.2026803493499756, "learning_rate": 1.7192557809580782e-07, "loss": 1.9713, "num_input_tokens_seen": 564600272, "step": 554500 }, { "epoch": 10.919257348742887, "grad_norm": 2.121382713317871, "learning_rate": 1.7177001570623152e-07, "loss": 1.9652, "num_input_tokens_seen": 564701904, "step": 554600 }, { "epoch": 10.92122620149239, "grad_norm": 1.7426812648773193, "learning_rate": 1.7161450912939546e-07, "loss": 1.9955, "num_input_tokens_seen": 564803376, "step": 554700 }, { "epoch": 10.923195054241893, "grad_norm": 1.8384121656417847, "learning_rate": 1.7145905839174241e-07, "loss": 1.9527, "num_input_tokens_seen": 564905776, "step": 554800 }, { "epoch": 10.925163906991397, "grad_norm": 2.107618808746338, "learning_rate": 1.7130366351970538e-07, "loss": 1.9534, "num_input_tokens_seen": 565008176, "step": 554900 }, { "epoch": 10.9271327597409, "grad_norm": 2.13130784034729, "learning_rate": 1.7114832453970822e-07, "loss": 1.9654, "num_input_tokens_seen": 565110576, "step": 555000 }, { "epoch": 10.929101612490403, "grad_norm": 1.6640058755874634, "learning_rate": 1.7099304147816467e-07, "loss": 1.9346, "num_input_tokens_seen": 565212792, "step": 555100 }, { "epoch": 10.931070465239905, "grad_norm": 1.8980098962783813, "learning_rate": 1.7083781436147943e-07, "loss": 1.969, "num_input_tokens_seen": 565315192, "step": 555200 }, { "epoch": 10.933039317989408, "grad_norm": 2.0015316009521484, "learning_rate": 1.7068264321604753e-07, "loss": 2.0155, "num_input_tokens_seen": 565415928, "step": 555300 }, { "epoch": 10.93500817073891, "grad_norm": 2.2221791744232178, "learning_rate": 1.7052752806825477e-07, "loss": 2.0495, "num_input_tokens_seen": 565517616, "step": 555400 }, { "epoch": 10.936977023488414, "grad_norm": 1.7417666912078857, "learning_rate": 1.7037246894447677e-07, "loss": 1.9904, "num_input_tokens_seen": 565619440, "step": 555500 }, { "epoch": 10.938945876237916, "grad_norm": 1.8724528551101685, "learning_rate": 1.702174658710801e-07, "loss": 1.9493, "num_input_tokens_seen": 565721296, "step": 555600 }, { "epoch": 10.94091472898742, "grad_norm": 1.9935745000839233, "learning_rate": 1.7006251887442213e-07, "loss": 1.9937, "num_input_tokens_seen": 565823608, "step": 555700 }, { "epoch": 10.942883581736922, "grad_norm": 1.8080589771270752, "learning_rate": 1.6990762798084984e-07, "loss": 2.0009, "num_input_tokens_seen": 565926008, "step": 555800 }, { "epoch": 10.944852434486425, "grad_norm": 2.0000579357147217, "learning_rate": 1.6975279321670148e-07, "loss": 1.9993, "num_input_tokens_seen": 566027696, "step": 555900 }, { "epoch": 10.946821287235927, "grad_norm": 1.9233328104019165, "learning_rate": 1.6959801460830487e-07, "loss": 2.0226, "num_input_tokens_seen": 566129400, "step": 556000 }, { "epoch": 10.94879013998543, "grad_norm": 1.6917973756790161, "learning_rate": 1.6944329218197943e-07, "loss": 1.9986, "num_input_tokens_seen": 566231272, "step": 556100 }, { "epoch": 10.950758992734933, "grad_norm": 1.933417558670044, "learning_rate": 1.6928862596403397e-07, "loss": 2.003, "num_input_tokens_seen": 566333672, "step": 556200 }, { "epoch": 10.952727845484436, "grad_norm": 1.990510106086731, "learning_rate": 1.6913401598076848e-07, "loss": 1.9752, "num_input_tokens_seen": 566436072, "step": 556300 }, { "epoch": 10.954696698233938, "grad_norm": 1.8237090110778809, "learning_rate": 1.6897946225847258e-07, "loss": 1.9981, "num_input_tokens_seen": 566537920, "step": 556400 }, { "epoch": 10.956665550983441, "grad_norm": 1.7059775590896606, "learning_rate": 1.6882496482342734e-07, "loss": 1.9692, "num_input_tokens_seen": 566640320, "step": 556500 }, { "epoch": 10.958634403732944, "grad_norm": 1.910999059677124, "learning_rate": 1.6867052370190376e-07, "loss": 1.9715, "num_input_tokens_seen": 566742720, "step": 556600 }, { "epoch": 10.960603256482448, "grad_norm": 1.8922381401062012, "learning_rate": 1.6851613892016286e-07, "loss": 1.9685, "num_input_tokens_seen": 566844328, "step": 556700 }, { "epoch": 10.962572109231951, "grad_norm": 2.10164475440979, "learning_rate": 1.683618105044567e-07, "loss": 1.9762, "num_input_tokens_seen": 566946600, "step": 556800 }, { "epoch": 10.964540961981454, "grad_norm": 1.8269226551055908, "learning_rate": 1.6820753848102748e-07, "loss": 1.9475, "num_input_tokens_seen": 567049000, "step": 556900 }, { "epoch": 10.966509814730957, "grad_norm": 1.7695835828781128, "learning_rate": 1.6805332287610807e-07, "loss": 2.0123, "num_input_tokens_seen": 567148664, "step": 557000 }, { "epoch": 10.96847866748046, "grad_norm": 1.875718593597412, "learning_rate": 1.678991637159211e-07, "loss": 2.0617, "num_input_tokens_seen": 567250544, "step": 557100 }, { "epoch": 10.970447520229962, "grad_norm": 1.7459222078323364, "learning_rate": 1.6774506102668035e-07, "loss": 1.9657, "num_input_tokens_seen": 567352448, "step": 557200 }, { "epoch": 10.972416372979465, "grad_norm": 2.0527901649475098, "learning_rate": 1.6759101483458954e-07, "loss": 1.9856, "num_input_tokens_seen": 567453400, "step": 557300 }, { "epoch": 10.974385225728968, "grad_norm": 1.7636523246765137, "learning_rate": 1.6743702516584323e-07, "loss": 2.0378, "num_input_tokens_seen": 567553656, "step": 557400 }, { "epoch": 10.97635407847847, "grad_norm": 1.9239298105239868, "learning_rate": 1.672830920466256e-07, "loss": 1.9628, "num_input_tokens_seen": 567656056, "step": 557500 }, { "epoch": 10.978322931227973, "grad_norm": 1.8326318264007568, "learning_rate": 1.6712921550311192e-07, "loss": 1.9569, "num_input_tokens_seen": 567758456, "step": 557600 }, { "epoch": 10.980291783977476, "grad_norm": 1.7801740169525146, "learning_rate": 1.6697539556146762e-07, "loss": 1.9886, "num_input_tokens_seen": 567860680, "step": 557700 }, { "epoch": 10.982260636726979, "grad_norm": 1.9315011501312256, "learning_rate": 1.668216322478484e-07, "loss": 2.0117, "num_input_tokens_seen": 567962464, "step": 557800 }, { "epoch": 10.984229489476482, "grad_norm": 2.050309419631958, "learning_rate": 1.6666792558840058e-07, "loss": 1.966, "num_input_tokens_seen": 568063992, "step": 557900 }, { "epoch": 10.986198342225984, "grad_norm": 1.7155494689941406, "learning_rate": 1.6651427560926053e-07, "loss": 1.9484, "num_input_tokens_seen": 568164104, "step": 558000 }, { "epoch": 10.988167194975487, "grad_norm": 3.1228291988372803, "learning_rate": 1.6636068233655504e-07, "loss": 1.9726, "num_input_tokens_seen": 568265776, "step": 558100 }, { "epoch": 10.99013604772499, "grad_norm": 1.9787405729293823, "learning_rate": 1.6620714579640155e-07, "loss": 1.9986, "num_input_tokens_seen": 568367792, "step": 558200 }, { "epoch": 10.992104900474494, "grad_norm": 1.7454941272735596, "learning_rate": 1.6605366601490767e-07, "loss": 1.9918, "num_input_tokens_seen": 568469384, "step": 558300 }, { "epoch": 10.994073753223997, "grad_norm": 2.203538179397583, "learning_rate": 1.659002430181712e-07, "loss": 2.0342, "num_input_tokens_seen": 568571048, "step": 558400 }, { "epoch": 10.9960426059735, "grad_norm": 2.1025588512420654, "learning_rate": 1.6574687683228016e-07, "loss": 2.0571, "num_input_tokens_seen": 568671840, "step": 558500 }, { "epoch": 10.998011458723003, "grad_norm": 18.16162109375, "learning_rate": 1.6559356748331394e-07, "loss": 1.9717, "num_input_tokens_seen": 568774240, "step": 558600 }, { "epoch": 10.999980311472505, "grad_norm": 1.8461054563522339, "learning_rate": 1.6544031499734085e-07, "loss": 2.0256, "num_input_tokens_seen": 568876208, "step": 558700 }, { "epoch": 11.001949164222008, "grad_norm": 1.747460126876831, "learning_rate": 1.6528711940042056e-07, "loss": 1.9573, "num_input_tokens_seen": 568978608, "step": 558800 }, { "epoch": 11.003918016971511, "grad_norm": 1.770186424255371, "learning_rate": 1.651339807186022e-07, "loss": 1.944, "num_input_tokens_seen": 569081008, "step": 558900 }, { "epoch": 11.005886869721014, "grad_norm": 1.6608457565307617, "learning_rate": 1.6498089897792634e-07, "loss": 2.0115, "num_input_tokens_seen": 569183408, "step": 559000 }, { "epoch": 11.007855722470516, "grad_norm": 1.6924635171890259, "learning_rate": 1.648278742044228e-07, "loss": 1.9466, "num_input_tokens_seen": 569284776, "step": 559100 }, { "epoch": 11.00982457522002, "grad_norm": 1.9252318143844604, "learning_rate": 1.646749064241124e-07, "loss": 1.9733, "num_input_tokens_seen": 569387176, "step": 559200 }, { "epoch": 11.011793427969522, "grad_norm": 1.762007474899292, "learning_rate": 1.6452199566300557e-07, "loss": 1.9397, "num_input_tokens_seen": 569489576, "step": 559300 }, { "epoch": 11.013762280719025, "grad_norm": 1.9775484800338745, "learning_rate": 1.6436914194710416e-07, "loss": 1.9486, "num_input_tokens_seen": 569591344, "step": 559400 }, { "epoch": 11.015731133468528, "grad_norm": 1.9309196472167969, "learning_rate": 1.6421634530239913e-07, "loss": 2.0292, "num_input_tokens_seen": 569693144, "step": 559500 }, { "epoch": 11.01769998621803, "grad_norm": 2.1214234828948975, "learning_rate": 1.6406360575487254e-07, "loss": 2.0163, "num_input_tokens_seen": 569793808, "step": 559600 }, { "epoch": 11.019668838967533, "grad_norm": 1.9098743200302124, "learning_rate": 1.6391092333049627e-07, "loss": 1.9609, "num_input_tokens_seen": 569894256, "step": 559700 }, { "epoch": 11.021637691717036, "grad_norm": 2.009169340133667, "learning_rate": 1.6375829805523285e-07, "loss": 2.0057, "num_input_tokens_seen": 569996112, "step": 559800 }, { "epoch": 11.023606544466539, "grad_norm": 2.2146265506744385, "learning_rate": 1.6360572995503503e-07, "loss": 1.9741, "num_input_tokens_seen": 570098512, "step": 559900 }, { "epoch": 11.025575397216043, "grad_norm": 2.114055633544922, "learning_rate": 1.634532190558453e-07, "loss": 1.9892, "num_input_tokens_seen": 570200912, "step": 560000 }, { "epoch": 11.027544249965546, "grad_norm": 1.9288376569747925, "learning_rate": 1.6330076538359717e-07, "loss": 2.0215, "num_input_tokens_seen": 570302456, "step": 560100 }, { "epoch": 11.029513102715049, "grad_norm": 1.7856731414794922, "learning_rate": 1.63148368964214e-07, "loss": 1.938, "num_input_tokens_seen": 570404128, "step": 560200 }, { "epoch": 11.031481955464551, "grad_norm": 2.039823293685913, "learning_rate": 1.6299602982360977e-07, "loss": 1.9825, "num_input_tokens_seen": 570504216, "step": 560300 }, { "epoch": 11.033450808214054, "grad_norm": 31.957164764404297, "learning_rate": 1.6284374798768802e-07, "loss": 2.0262, "num_input_tokens_seen": 570605856, "step": 560400 }, { "epoch": 11.035419660963557, "grad_norm": 1.9994714260101318, "learning_rate": 1.6269152348234328e-07, "loss": 1.9857, "num_input_tokens_seen": 570706544, "step": 560500 }, { "epoch": 11.03738851371306, "grad_norm": 2.319862127304077, "learning_rate": 1.6253935633345995e-07, "loss": 1.9742, "num_input_tokens_seen": 570808264, "step": 560600 }, { "epoch": 11.039357366462562, "grad_norm": 1.614006519317627, "learning_rate": 1.6238724656691277e-07, "loss": 1.9635, "num_input_tokens_seen": 570910664, "step": 560700 }, { "epoch": 11.041326219212065, "grad_norm": 1.8084412813186646, "learning_rate": 1.6223519420856697e-07, "loss": 1.9952, "num_input_tokens_seen": 571011392, "step": 560800 }, { "epoch": 11.043295071961568, "grad_norm": 1.8977243900299072, "learning_rate": 1.6208319928427732e-07, "loss": 1.9233, "num_input_tokens_seen": 571113056, "step": 560900 }, { "epoch": 11.04526392471107, "grad_norm": 1.9307817220687866, "learning_rate": 1.6193126181988958e-07, "loss": 2.0118, "num_input_tokens_seen": 571214160, "step": 561000 }, { "epoch": 11.047232777460573, "grad_norm": 2.055455207824707, "learning_rate": 1.6177938184123928e-07, "loss": 1.9679, "num_input_tokens_seen": 571315792, "step": 561100 }, { "epoch": 11.049201630210076, "grad_norm": 1.7375178337097168, "learning_rate": 1.6162755937415268e-07, "loss": 1.9897, "num_input_tokens_seen": 571416736, "step": 561200 }, { "epoch": 11.051170482959579, "grad_norm": 1.7575231790542603, "learning_rate": 1.6147579444444526e-07, "loss": 1.978, "num_input_tokens_seen": 571518296, "step": 561300 }, { "epoch": 11.053139335709082, "grad_norm": 1.8998470306396484, "learning_rate": 1.6132408707792411e-07, "loss": 2.0047, "num_input_tokens_seen": 571619800, "step": 561400 }, { "epoch": 11.055108188458584, "grad_norm": 2.5749189853668213, "learning_rate": 1.611724373003852e-07, "loss": 1.9374, "num_input_tokens_seen": 571721408, "step": 561500 }, { "epoch": 11.057077041208087, "grad_norm": 1.9108294248580933, "learning_rate": 1.610208451376156e-07, "loss": 1.9598, "num_input_tokens_seen": 571822328, "step": 561600 }, { "epoch": 11.059045893957592, "grad_norm": 2.0555667877197266, "learning_rate": 1.6086931061539217e-07, "loss": 1.9683, "num_input_tokens_seen": 571923528, "step": 561700 }, { "epoch": 11.061014746707095, "grad_norm": 2.043241024017334, "learning_rate": 1.607178337594821e-07, "loss": 1.9853, "num_input_tokens_seen": 572025928, "step": 561800 }, { "epoch": 11.062983599456597, "grad_norm": 1.7151275873184204, "learning_rate": 1.6056641459564292e-07, "loss": 1.9622, "num_input_tokens_seen": 572128328, "step": 561900 }, { "epoch": 11.0649524522061, "grad_norm": 1.8281558752059937, "learning_rate": 1.604150531496219e-07, "loss": 1.9895, "num_input_tokens_seen": 572230728, "step": 562000 }, { "epoch": 11.066921304955603, "grad_norm": 2.052183151245117, "learning_rate": 1.6026374944715682e-07, "loss": 1.9744, "num_input_tokens_seen": 572333128, "step": 562100 }, { "epoch": 11.068890157705106, "grad_norm": 2.437272548675537, "learning_rate": 1.6011250351397575e-07, "loss": 1.952, "num_input_tokens_seen": 572435528, "step": 562200 }, { "epoch": 11.070859010454608, "grad_norm": 1.8878471851348877, "learning_rate": 1.5996131537579688e-07, "loss": 1.9921, "num_input_tokens_seen": 572537136, "step": 562300 }, { "epoch": 11.072827863204111, "grad_norm": 1.868731141090393, "learning_rate": 1.598101850583281e-07, "loss": 1.965, "num_input_tokens_seen": 572639536, "step": 562400 }, { "epoch": 11.074796715953614, "grad_norm": 1.9930466413497925, "learning_rate": 1.596591125872681e-07, "loss": 2.0186, "num_input_tokens_seen": 572741464, "step": 562500 }, { "epoch": 11.076765568703117, "grad_norm": 2.220581293106079, "learning_rate": 1.595080979883054e-07, "loss": 2.0402, "num_input_tokens_seen": 572843864, "step": 562600 }, { "epoch": 11.07873442145262, "grad_norm": 1.8037691116333008, "learning_rate": 1.5935714128711886e-07, "loss": 1.9514, "num_input_tokens_seen": 572946264, "step": 562700 }, { "epoch": 11.080703274202122, "grad_norm": 1.759264588356018, "learning_rate": 1.5920624250937743e-07, "loss": 1.9852, "num_input_tokens_seen": 573048664, "step": 562800 }, { "epoch": 11.082672126951625, "grad_norm": 2.2461397647857666, "learning_rate": 1.5905540168073993e-07, "loss": 1.95, "num_input_tokens_seen": 573151064, "step": 562900 }, { "epoch": 11.084640979701128, "grad_norm": 1.8784043788909912, "learning_rate": 1.589046188268557e-07, "loss": 1.9567, "num_input_tokens_seen": 573253464, "step": 563000 }, { "epoch": 11.08660983245063, "grad_norm": 2.1749541759490967, "learning_rate": 1.5875389397336413e-07, "loss": 1.9779, "num_input_tokens_seen": 573355048, "step": 563100 }, { "epoch": 11.088578685200133, "grad_norm": 2.208847999572754, "learning_rate": 1.5860322714589487e-07, "loss": 1.9522, "num_input_tokens_seen": 573456920, "step": 563200 }, { "epoch": 11.090547537949636, "grad_norm": 1.80336332321167, "learning_rate": 1.5845261837006712e-07, "loss": 1.9963, "num_input_tokens_seen": 573559216, "step": 563300 }, { "epoch": 11.09251639069914, "grad_norm": 1.8268429040908813, "learning_rate": 1.5830206767149097e-07, "loss": 1.994, "num_input_tokens_seen": 573661616, "step": 563400 }, { "epoch": 11.094485243448643, "grad_norm": 1.6226472854614258, "learning_rate": 1.581515750757661e-07, "loss": 1.993, "num_input_tokens_seen": 573761264, "step": 563500 }, { "epoch": 11.096454096198146, "grad_norm": 1.9020010232925415, "learning_rate": 1.5800114060848268e-07, "loss": 1.9719, "num_input_tokens_seen": 573863528, "step": 563600 }, { "epoch": 11.098422948947649, "grad_norm": 2.0572657585144043, "learning_rate": 1.5785076429522089e-07, "loss": 1.9866, "num_input_tokens_seen": 573964744, "step": 563700 }, { "epoch": 11.100391801697151, "grad_norm": 2.0211639404296875, "learning_rate": 1.5770044616155048e-07, "loss": 1.9711, "num_input_tokens_seen": 574067144, "step": 563800 }, { "epoch": 11.102360654446654, "grad_norm": 1.8248523473739624, "learning_rate": 1.5755018623303247e-07, "loss": 1.9967, "num_input_tokens_seen": 574169232, "step": 563900 }, { "epoch": 11.104329507196157, "grad_norm": 1.644024133682251, "learning_rate": 1.5739998453521668e-07, "loss": 1.9236, "num_input_tokens_seen": 574271632, "step": 564000 }, { "epoch": 11.10629835994566, "grad_norm": 2.086744785308838, "learning_rate": 1.5724984109364414e-07, "loss": 1.9722, "num_input_tokens_seen": 574373296, "step": 564100 }, { "epoch": 11.108267212695162, "grad_norm": 2.0320801734924316, "learning_rate": 1.5709975593384489e-07, "loss": 1.9822, "num_input_tokens_seen": 574475408, "step": 564200 }, { "epoch": 11.110236065444665, "grad_norm": 2.0280330181121826, "learning_rate": 1.5694972908134034e-07, "loss": 2.032, "num_input_tokens_seen": 574576424, "step": 564300 }, { "epoch": 11.112204918194168, "grad_norm": 1.6259900331497192, "learning_rate": 1.5679976056164074e-07, "loss": 2.0213, "num_input_tokens_seen": 574678512, "step": 564400 }, { "epoch": 11.11417377094367, "grad_norm": 1.9069445133209229, "learning_rate": 1.5664985040024737e-07, "loss": 1.9384, "num_input_tokens_seen": 574779736, "step": 564500 }, { "epoch": 11.116142623693174, "grad_norm": 1.8923521041870117, "learning_rate": 1.564999986226506e-07, "loss": 2.0487, "num_input_tokens_seen": 574881512, "step": 564600 }, { "epoch": 11.118111476442676, "grad_norm": 2.391019582748413, "learning_rate": 1.5635020525433207e-07, "loss": 1.9697, "num_input_tokens_seen": 574983912, "step": 564700 }, { "epoch": 11.120080329192179, "grad_norm": 2.0226306915283203, "learning_rate": 1.5620047032076282e-07, "loss": 1.9658, "num_input_tokens_seen": 575086312, "step": 564800 }, { "epoch": 11.122049181941682, "grad_norm": 2.0029852390289307, "learning_rate": 1.5605079384740373e-07, "loss": 2.0302, "num_input_tokens_seen": 575188712, "step": 564900 }, { "epoch": 11.124018034691186, "grad_norm": 2.1440653800964355, "learning_rate": 1.5590117585970608e-07, "loss": 1.9845, "num_input_tokens_seen": 575291112, "step": 565000 }, { "epoch": 11.125986887440689, "grad_norm": 1.9851350784301758, "learning_rate": 1.5575161638311124e-07, "loss": 1.9315, "num_input_tokens_seen": 575393512, "step": 565100 }, { "epoch": 11.127955740190192, "grad_norm": 2.07045316696167, "learning_rate": 1.5560211544305063e-07, "loss": 1.9842, "num_input_tokens_seen": 575495392, "step": 565200 }, { "epoch": 11.129924592939695, "grad_norm": 2.118616819381714, "learning_rate": 1.5545267306494537e-07, "loss": 1.9941, "num_input_tokens_seen": 575597792, "step": 565300 }, { "epoch": 11.131893445689197, "grad_norm": 1.6898269653320312, "learning_rate": 1.5530328927420705e-07, "loss": 1.998, "num_input_tokens_seen": 575699280, "step": 565400 }, { "epoch": 11.1338622984387, "grad_norm": 1.656571865081787, "learning_rate": 1.5515396409623704e-07, "loss": 1.9904, "num_input_tokens_seen": 575799888, "step": 565500 }, { "epoch": 11.135831151188203, "grad_norm": 2.069849967956543, "learning_rate": 1.5500469755642702e-07, "loss": 1.9609, "num_input_tokens_seen": 575902288, "step": 565600 }, { "epoch": 11.137800003937706, "grad_norm": 2.019660234451294, "learning_rate": 1.548554896801582e-07, "loss": 1.9981, "num_input_tokens_seen": 576002808, "step": 565700 }, { "epoch": 11.139768856687208, "grad_norm": 1.7920044660568237, "learning_rate": 1.5470634049280228e-07, "loss": 2.0262, "num_input_tokens_seen": 576104288, "step": 565800 }, { "epoch": 11.141737709436711, "grad_norm": 2.0878922939300537, "learning_rate": 1.545572500197208e-07, "loss": 2.0091, "num_input_tokens_seen": 576206136, "step": 565900 }, { "epoch": 11.143706562186214, "grad_norm": 1.9078084230422974, "learning_rate": 1.5440821828626538e-07, "loss": 1.9671, "num_input_tokens_seen": 576308536, "step": 566000 }, { "epoch": 11.145675414935717, "grad_norm": 1.8267896175384521, "learning_rate": 1.542592453177778e-07, "loss": 1.98, "num_input_tokens_seen": 576410368, "step": 566100 }, { "epoch": 11.14764426768522, "grad_norm": 2.090945243835449, "learning_rate": 1.541103311395892e-07, "loss": 1.9989, "num_input_tokens_seen": 576511456, "step": 566200 }, { "epoch": 11.149613120434722, "grad_norm": 2.000983476638794, "learning_rate": 1.5396147577702145e-07, "loss": 1.9486, "num_input_tokens_seen": 576612360, "step": 566300 }, { "epoch": 11.151581973184225, "grad_norm": 1.9991554021835327, "learning_rate": 1.538126792553861e-07, "loss": 2.0102, "num_input_tokens_seen": 576713904, "step": 566400 }, { "epoch": 11.153550825933728, "grad_norm": 1.842217206954956, "learning_rate": 1.5366394159998498e-07, "loss": 1.9764, "num_input_tokens_seen": 576815784, "step": 566500 }, { "epoch": 11.15551967868323, "grad_norm": 1.7024178504943848, "learning_rate": 1.5351526283610922e-07, "loss": 1.9892, "num_input_tokens_seen": 576917360, "step": 566600 }, { "epoch": 11.157488531432735, "grad_norm": 1.959771990776062, "learning_rate": 1.533666429890405e-07, "loss": 1.9749, "num_input_tokens_seen": 577019760, "step": 566700 }, { "epoch": 11.159457384182238, "grad_norm": 2.245302200317383, "learning_rate": 1.5321808208405084e-07, "loss": 1.9923, "num_input_tokens_seen": 577121680, "step": 566800 }, { "epoch": 11.16142623693174, "grad_norm": 1.8373440504074097, "learning_rate": 1.5306958014640125e-07, "loss": 2.0039, "num_input_tokens_seen": 577224080, "step": 566900 }, { "epoch": 11.163395089681243, "grad_norm": 1.9024267196655273, "learning_rate": 1.529211372013436e-07, "loss": 1.9718, "num_input_tokens_seen": 577326480, "step": 567000 }, { "epoch": 11.165363942430746, "grad_norm": 1.8979979753494263, "learning_rate": 1.527727532741188e-07, "loss": 1.976, "num_input_tokens_seen": 577428200, "step": 567100 }, { "epoch": 11.167332795180249, "grad_norm": 1.8532240390777588, "learning_rate": 1.5262442838995903e-07, "loss": 1.9921, "num_input_tokens_seen": 577530600, "step": 567200 }, { "epoch": 11.169301647929752, "grad_norm": 2.6826493740081787, "learning_rate": 1.5247616257408508e-07, "loss": 1.9831, "num_input_tokens_seen": 577631408, "step": 567300 }, { "epoch": 11.171270500679254, "grad_norm": 1.8805820941925049, "learning_rate": 1.5232795585170876e-07, "loss": 1.9984, "num_input_tokens_seen": 577731176, "step": 567400 }, { "epoch": 11.173239353428757, "grad_norm": 2.616795539855957, "learning_rate": 1.5217980824803078e-07, "loss": 1.9783, "num_input_tokens_seen": 577833576, "step": 567500 }, { "epoch": 11.17520820617826, "grad_norm": 1.8681564331054688, "learning_rate": 1.5203171978824315e-07, "loss": 2.0714, "num_input_tokens_seen": 577935976, "step": 567600 }, { "epoch": 11.177177058927763, "grad_norm": 2.055142402648926, "learning_rate": 1.5188369049752643e-07, "loss": 1.951, "num_input_tokens_seen": 578038376, "step": 567700 }, { "epoch": 11.179145911677265, "grad_norm": 2.36122727394104, "learning_rate": 1.51735720401052e-07, "loss": 1.9677, "num_input_tokens_seen": 578140776, "step": 567800 }, { "epoch": 11.181114764426768, "grad_norm": 2.0014007091522217, "learning_rate": 1.5158780952398092e-07, "loss": 2.0331, "num_input_tokens_seen": 578243176, "step": 567900 }, { "epoch": 11.18308361717627, "grad_norm": 1.9691665172576904, "learning_rate": 1.5143995789146423e-07, "loss": 2.016, "num_input_tokens_seen": 578344376, "step": 568000 }, { "epoch": 11.185052469925774, "grad_norm": 2.918562412261963, "learning_rate": 1.5129216552864294e-07, "loss": 1.9612, "num_input_tokens_seen": 578446216, "step": 568100 }, { "epoch": 11.187021322675276, "grad_norm": 1.950661540031433, "learning_rate": 1.5114443246064762e-07, "loss": 1.9729, "num_input_tokens_seen": 578548616, "step": 568200 }, { "epoch": 11.18899017542478, "grad_norm": 1.8653844594955444, "learning_rate": 1.5099675871259914e-07, "loss": 1.9807, "num_input_tokens_seen": 578650312, "step": 568300 }, { "epoch": 11.190959028174284, "grad_norm": 1.8881200551986694, "learning_rate": 1.5084914430960827e-07, "loss": 1.978, "num_input_tokens_seen": 578752168, "step": 568400 }, { "epoch": 11.192927880923786, "grad_norm": 1.8521870374679565, "learning_rate": 1.5070158927677568e-07, "loss": 1.9887, "num_input_tokens_seen": 578853584, "step": 568500 }, { "epoch": 11.19489673367329, "grad_norm": 1.9622387886047363, "learning_rate": 1.5055409363919153e-07, "loss": 1.9914, "num_input_tokens_seen": 578955536, "step": 568600 }, { "epoch": 11.196865586422792, "grad_norm": 1.9085460901260376, "learning_rate": 1.5040665742193636e-07, "loss": 2.0241, "num_input_tokens_seen": 579057936, "step": 568700 }, { "epoch": 11.198834439172295, "grad_norm": 2.0087790489196777, "learning_rate": 1.5025928065008053e-07, "loss": 1.989, "num_input_tokens_seen": 579160336, "step": 568800 }, { "epoch": 11.200803291921797, "grad_norm": 2.1040682792663574, "learning_rate": 1.501119633486842e-07, "loss": 1.9584, "num_input_tokens_seen": 579262048, "step": 568900 }, { "epoch": 11.2027721446713, "grad_norm": 1.6661858558654785, "learning_rate": 1.4996470554279755e-07, "loss": 1.9962, "num_input_tokens_seen": 579364360, "step": 569000 }, { "epoch": 11.204740997420803, "grad_norm": 2.2188713550567627, "learning_rate": 1.4981750725746023e-07, "loss": 2.0264, "num_input_tokens_seen": 579465560, "step": 569100 }, { "epoch": 11.206709850170306, "grad_norm": 2.0795061588287354, "learning_rate": 1.496703685177022e-07, "loss": 1.9811, "num_input_tokens_seen": 579567960, "step": 569200 }, { "epoch": 11.208678702919809, "grad_norm": 1.8177028894424438, "learning_rate": 1.495232893485432e-07, "loss": 2.0287, "num_input_tokens_seen": 579670360, "step": 569300 }, { "epoch": 11.210647555669311, "grad_norm": 7.807389259338379, "learning_rate": 1.4937626977499295e-07, "loss": 1.982, "num_input_tokens_seen": 579771912, "step": 569400 }, { "epoch": 11.212616408418814, "grad_norm": 1.7488486766815186, "learning_rate": 1.4922930982205045e-07, "loss": 1.9384, "num_input_tokens_seen": 579874312, "step": 569500 }, { "epoch": 11.214585261168317, "grad_norm": 1.8570404052734375, "learning_rate": 1.490824095147055e-07, "loss": 2.005, "num_input_tokens_seen": 579976712, "step": 569600 }, { "epoch": 11.21655411391782, "grad_norm": 1.8749765157699585, "learning_rate": 1.4893556887793702e-07, "loss": 2.0216, "num_input_tokens_seen": 580077800, "step": 569700 }, { "epoch": 11.218522966667322, "grad_norm": 1.9788738489151, "learning_rate": 1.487887879367139e-07, "loss": 2.024, "num_input_tokens_seen": 580180200, "step": 569800 }, { "epoch": 11.220491819416825, "grad_norm": 2.037383794784546, "learning_rate": 1.4864206671599522e-07, "loss": 2.033, "num_input_tokens_seen": 580282600, "step": 569900 }, { "epoch": 11.222460672166328, "grad_norm": 1.9531891345977783, "learning_rate": 1.4849540524072956e-07, "loss": 2.0203, "num_input_tokens_seen": 580384776, "step": 570000 }, { "epoch": 11.224429524915832, "grad_norm": 2.8155910968780518, "learning_rate": 1.4834880353585573e-07, "loss": 1.9994, "num_input_tokens_seen": 580486544, "step": 570100 }, { "epoch": 11.226398377665335, "grad_norm": 1.8555750846862793, "learning_rate": 1.4820226162630168e-07, "loss": 1.9608, "num_input_tokens_seen": 580588184, "step": 570200 }, { "epoch": 11.228367230414838, "grad_norm": 4.470939636230469, "learning_rate": 1.4805577953698589e-07, "loss": 1.9514, "num_input_tokens_seen": 580690584, "step": 570300 }, { "epoch": 11.23033608316434, "grad_norm": 1.8449416160583496, "learning_rate": 1.4790935729281627e-07, "loss": 1.9708, "num_input_tokens_seen": 580792984, "step": 570400 }, { "epoch": 11.232304935913843, "grad_norm": 1.840404987335205, "learning_rate": 1.4776299491869104e-07, "loss": 2.0117, "num_input_tokens_seen": 580894040, "step": 570500 }, { "epoch": 11.234273788663346, "grad_norm": 1.9039394855499268, "learning_rate": 1.4761669243949738e-07, "loss": 1.9612, "num_input_tokens_seen": 580995568, "step": 570600 }, { "epoch": 11.236242641412849, "grad_norm": 1.957252860069275, "learning_rate": 1.474704498801131e-07, "loss": 1.9442, "num_input_tokens_seen": 581097968, "step": 570700 }, { "epoch": 11.238211494162352, "grad_norm": 1.939706802368164, "learning_rate": 1.4732426726540538e-07, "loss": 1.986, "num_input_tokens_seen": 581200368, "step": 570800 }, { "epoch": 11.240180346911854, "grad_norm": 1.6529273986816406, "learning_rate": 1.4717814462023142e-07, "loss": 1.9895, "num_input_tokens_seen": 581302768, "step": 570900 }, { "epoch": 11.242149199661357, "grad_norm": 1.947439432144165, "learning_rate": 1.4703208196943833e-07, "loss": 1.9941, "num_input_tokens_seen": 581404080, "step": 571000 }, { "epoch": 11.24411805241086, "grad_norm": 1.882500171661377, "learning_rate": 1.4688607933786247e-07, "loss": 1.9897, "num_input_tokens_seen": 581505192, "step": 571100 }, { "epoch": 11.246086905160363, "grad_norm": 2.2510392665863037, "learning_rate": 1.4674013675033047e-07, "loss": 2.0142, "num_input_tokens_seen": 581607592, "step": 571200 }, { "epoch": 11.248055757909865, "grad_norm": 1.9028221368789673, "learning_rate": 1.465942542316587e-07, "loss": 1.9901, "num_input_tokens_seen": 581708536, "step": 571300 }, { "epoch": 11.250024610659368, "grad_norm": 2.0776336193084717, "learning_rate": 1.464484318066534e-07, "loss": 2.0462, "num_input_tokens_seen": 581810296, "step": 571400 }, { "epoch": 11.251993463408871, "grad_norm": 5.531641006469727, "learning_rate": 1.4630266950011012e-07, "loss": 1.995, "num_input_tokens_seen": 581910408, "step": 571500 }, { "epoch": 11.253962316158374, "grad_norm": 1.7137070894241333, "learning_rate": 1.461569673368147e-07, "loss": 1.9868, "num_input_tokens_seen": 582012000, "step": 571600 }, { "epoch": 11.255931168907878, "grad_norm": 1.8212003707885742, "learning_rate": 1.460113253415425e-07, "loss": 1.9675, "num_input_tokens_seen": 582113920, "step": 571700 }, { "epoch": 11.257900021657381, "grad_norm": 2.0662999153137207, "learning_rate": 1.458657435390588e-07, "loss": 1.9635, "num_input_tokens_seen": 582216320, "step": 571800 }, { "epoch": 11.259868874406884, "grad_norm": 3.0008654594421387, "learning_rate": 1.457202219541186e-07, "loss": 1.9334, "num_input_tokens_seen": 582317976, "step": 571900 }, { "epoch": 11.261837727156387, "grad_norm": 1.8614851236343384, "learning_rate": 1.4557476061146624e-07, "loss": 1.9132, "num_input_tokens_seen": 582419872, "step": 572000 }, { "epoch": 11.26380657990589, "grad_norm": 2.2235891819000244, "learning_rate": 1.4542935953583673e-07, "loss": 1.9686, "num_input_tokens_seen": 582521016, "step": 572100 }, { "epoch": 11.265775432655392, "grad_norm": 2.6485838890075684, "learning_rate": 1.4528401875195395e-07, "loss": 1.9833, "num_input_tokens_seen": 582623416, "step": 572200 }, { "epoch": 11.267744285404895, "grad_norm": 1.9150540828704834, "learning_rate": 1.4513873828453215e-07, "loss": 1.9421, "num_input_tokens_seen": 582725088, "step": 572300 }, { "epoch": 11.269713138154398, "grad_norm": 2.054717779159546, "learning_rate": 1.449935181582745e-07, "loss": 1.9725, "num_input_tokens_seen": 582826864, "step": 572400 }, { "epoch": 11.2716819909039, "grad_norm": 2.01460337638855, "learning_rate": 1.4484835839787512e-07, "loss": 2.0038, "num_input_tokens_seen": 582929264, "step": 572500 }, { "epoch": 11.273650843653403, "grad_norm": 1.9440685510635376, "learning_rate": 1.4470325902801684e-07, "loss": 1.9878, "num_input_tokens_seen": 583030960, "step": 572600 }, { "epoch": 11.275619696402906, "grad_norm": 1.9856462478637695, "learning_rate": 1.4455822007337271e-07, "loss": 2.0348, "num_input_tokens_seen": 583131856, "step": 572700 }, { "epoch": 11.277588549152409, "grad_norm": 2.501225233078003, "learning_rate": 1.4441324155860506e-07, "loss": 1.9922, "num_input_tokens_seen": 583233440, "step": 572800 }, { "epoch": 11.279557401901911, "grad_norm": 2.163364887237549, "learning_rate": 1.4426832350836671e-07, "loss": 2.0571, "num_input_tokens_seen": 583335224, "step": 572900 }, { "epoch": 11.281526254651414, "grad_norm": 1.930635690689087, "learning_rate": 1.4412346594729967e-07, "loss": 1.9999, "num_input_tokens_seen": 583437624, "step": 573000 }, { "epoch": 11.283495107400917, "grad_norm": 1.994875192642212, "learning_rate": 1.439786689000355e-07, "loss": 1.9369, "num_input_tokens_seen": 583540024, "step": 573100 }, { "epoch": 11.28546396015042, "grad_norm": 2.055906057357788, "learning_rate": 1.4383393239119585e-07, "loss": 1.9874, "num_input_tokens_seen": 583641720, "step": 573200 }, { "epoch": 11.287432812899922, "grad_norm": 2.022674798965454, "learning_rate": 1.4368925644539199e-07, "loss": 1.9549, "num_input_tokens_seen": 583744120, "step": 573300 }, { "epoch": 11.289401665649427, "grad_norm": 1.8915902376174927, "learning_rate": 1.4354464108722497e-07, "loss": 1.9397, "num_input_tokens_seen": 583846520, "step": 573400 }, { "epoch": 11.29137051839893, "grad_norm": 2.067164659500122, "learning_rate": 1.4340008634128503e-07, "loss": 1.9893, "num_input_tokens_seen": 583948448, "step": 573500 }, { "epoch": 11.293339371148432, "grad_norm": 1.889007329940796, "learning_rate": 1.4325559223215278e-07, "loss": 1.9454, "num_input_tokens_seen": 584050848, "step": 573600 }, { "epoch": 11.295308223897935, "grad_norm": 1.9478282928466797, "learning_rate": 1.4311115878439812e-07, "loss": 2.02, "num_input_tokens_seen": 584153248, "step": 573700 }, { "epoch": 11.297277076647438, "grad_norm": 1.7752412557601929, "learning_rate": 1.429667860225807e-07, "loss": 1.997, "num_input_tokens_seen": 584255008, "step": 573800 }, { "epoch": 11.29924592939694, "grad_norm": 2.0746665000915527, "learning_rate": 1.428224739712502e-07, "loss": 1.9901, "num_input_tokens_seen": 584356096, "step": 573900 }, { "epoch": 11.301214782146443, "grad_norm": 2.286505699157715, "learning_rate": 1.426782226549453e-07, "loss": 1.9713, "num_input_tokens_seen": 584457408, "step": 574000 }, { "epoch": 11.303183634895946, "grad_norm": 1.9341868162155151, "learning_rate": 1.4253403209819476e-07, "loss": 1.9918, "num_input_tokens_seen": 584559808, "step": 574100 }, { "epoch": 11.305152487645449, "grad_norm": 1.6348973512649536, "learning_rate": 1.4238990232551713e-07, "loss": 1.9851, "num_input_tokens_seen": 584661440, "step": 574200 }, { "epoch": 11.307121340394952, "grad_norm": 2.006460428237915, "learning_rate": 1.4224583336142043e-07, "loss": 1.994, "num_input_tokens_seen": 584762864, "step": 574300 }, { "epoch": 11.309090193144455, "grad_norm": 1.9484412670135498, "learning_rate": 1.4210182523040226e-07, "loss": 2.0, "num_input_tokens_seen": 584865264, "step": 574400 }, { "epoch": 11.311059045893957, "grad_norm": 1.9613430500030518, "learning_rate": 1.4195787795695007e-07, "loss": 1.9854, "num_input_tokens_seen": 584966056, "step": 574500 }, { "epoch": 11.31302789864346, "grad_norm": 1.8905065059661865, "learning_rate": 1.4181399156554087e-07, "loss": 1.9642, "num_input_tokens_seen": 585065648, "step": 574600 }, { "epoch": 11.314996751392963, "grad_norm": 1.8770084381103516, "learning_rate": 1.4167016608064142e-07, "loss": 2.0091, "num_input_tokens_seen": 585166184, "step": 574700 }, { "epoch": 11.316965604142466, "grad_norm": 1.7596750259399414, "learning_rate": 1.4152640152670782e-07, "loss": 2.0012, "num_input_tokens_seen": 585268584, "step": 574800 }, { "epoch": 11.318934456891968, "grad_norm": 1.8129584789276123, "learning_rate": 1.4138269792818596e-07, "loss": 1.9249, "num_input_tokens_seen": 585370144, "step": 574900 }, { "epoch": 11.320903309641471, "grad_norm": 2.135826826095581, "learning_rate": 1.412390553095119e-07, "loss": 2.0058, "num_input_tokens_seen": 585471936, "step": 575000 }, { "epoch": 11.322872162390976, "grad_norm": 2.023653984069824, "learning_rate": 1.4109547369511044e-07, "loss": 2.0141, "num_input_tokens_seen": 585574336, "step": 575100 }, { "epoch": 11.324841015140478, "grad_norm": 2.0453250408172607, "learning_rate": 1.4095195310939668e-07, "loss": 1.9526, "num_input_tokens_seen": 585676200, "step": 575200 }, { "epoch": 11.326809867889981, "grad_norm": 1.8389403820037842, "learning_rate": 1.408084935767746e-07, "loss": 2.0132, "num_input_tokens_seen": 585777960, "step": 575300 }, { "epoch": 11.328778720639484, "grad_norm": 1.771575927734375, "learning_rate": 1.4066509512163898e-07, "loss": 1.9714, "num_input_tokens_seen": 585879760, "step": 575400 }, { "epoch": 11.330747573388987, "grad_norm": 1.759372353553772, "learning_rate": 1.40521757768373e-07, "loss": 1.9772, "num_input_tokens_seen": 585981528, "step": 575500 }, { "epoch": 11.33271642613849, "grad_norm": 1.9839931726455688, "learning_rate": 1.4037848154135028e-07, "loss": 1.9412, "num_input_tokens_seen": 586083232, "step": 575600 }, { "epoch": 11.334685278887992, "grad_norm": 1.7879927158355713, "learning_rate": 1.4023526646493328e-07, "loss": 1.9495, "num_input_tokens_seen": 586185632, "step": 575700 }, { "epoch": 11.336654131637495, "grad_norm": 1.7677305936813354, "learning_rate": 1.4009211256347492e-07, "loss": 1.9981, "num_input_tokens_seen": 586287736, "step": 575800 }, { "epoch": 11.338622984386998, "grad_norm": 2.170419692993164, "learning_rate": 1.399490198613174e-07, "loss": 1.9876, "num_input_tokens_seen": 586390136, "step": 575900 }, { "epoch": 11.3405918371365, "grad_norm": 1.8429392576217651, "learning_rate": 1.3980598838279212e-07, "loss": 1.9817, "num_input_tokens_seen": 586492536, "step": 576000 }, { "epoch": 11.342560689886003, "grad_norm": 2.134028673171997, "learning_rate": 1.3966301815222049e-07, "loss": 1.9608, "num_input_tokens_seen": 586594272, "step": 576100 }, { "epoch": 11.344529542635506, "grad_norm": 1.9560480117797852, "learning_rate": 1.3952010919391344e-07, "loss": 2.005, "num_input_tokens_seen": 586696672, "step": 576200 }, { "epoch": 11.346498395385009, "grad_norm": 1.8093209266662598, "learning_rate": 1.393772615321715e-07, "loss": 1.9469, "num_input_tokens_seen": 586799072, "step": 576300 }, { "epoch": 11.348467248134511, "grad_norm": 2.019213914871216, "learning_rate": 1.3923447519128438e-07, "loss": 2.0158, "num_input_tokens_seen": 586900176, "step": 576400 }, { "epoch": 11.350436100884014, "grad_norm": 1.845041036605835, "learning_rate": 1.3909175019553206e-07, "loss": 1.9645, "num_input_tokens_seen": 587002576, "step": 576500 }, { "epoch": 11.352404953633517, "grad_norm": 2.8705618381500244, "learning_rate": 1.3894908656918346e-07, "loss": 1.9708, "num_input_tokens_seen": 587104016, "step": 576600 }, { "epoch": 11.35437380638302, "grad_norm": 1.7919387817382812, "learning_rate": 1.3880648433649767e-07, "loss": 1.9898, "num_input_tokens_seen": 587205496, "step": 576700 }, { "epoch": 11.356342659132524, "grad_norm": 1.9043221473693848, "learning_rate": 1.3866394352172262e-07, "loss": 1.9464, "num_input_tokens_seen": 587307896, "step": 576800 }, { "epoch": 11.358311511882027, "grad_norm": 1.738218069076538, "learning_rate": 1.3852146414909626e-07, "loss": 1.9639, "num_input_tokens_seen": 587410296, "step": 576900 }, { "epoch": 11.36028036463153, "grad_norm": 2.0904452800750732, "learning_rate": 1.3837904624284613e-07, "loss": 1.9839, "num_input_tokens_seen": 587511824, "step": 577000 }, { "epoch": 11.362249217381033, "grad_norm": 1.8238475322723389, "learning_rate": 1.3823668982718912e-07, "loss": 1.923, "num_input_tokens_seen": 587614224, "step": 577100 }, { "epoch": 11.364218070130535, "grad_norm": 1.5709075927734375, "learning_rate": 1.380943949263319e-07, "loss": 1.999, "num_input_tokens_seen": 587716080, "step": 577200 }, { "epoch": 11.366186922880038, "grad_norm": 1.9707057476043701, "learning_rate": 1.379521615644702e-07, "loss": 1.9936, "num_input_tokens_seen": 587818480, "step": 577300 }, { "epoch": 11.36815577562954, "grad_norm": 2.1370668411254883, "learning_rate": 1.3780998976578984e-07, "loss": 1.9804, "num_input_tokens_seen": 587919928, "step": 577400 }, { "epoch": 11.370124628379044, "grad_norm": 3.453627347946167, "learning_rate": 1.376678795544658e-07, "loss": 1.981, "num_input_tokens_seen": 588022328, "step": 577500 }, { "epoch": 11.372093481128546, "grad_norm": 1.943253517150879, "learning_rate": 1.3752583095466296e-07, "loss": 2.0243, "num_input_tokens_seen": 588123872, "step": 577600 }, { "epoch": 11.374062333878049, "grad_norm": 1.6537712812423706, "learning_rate": 1.3738384399053504e-07, "loss": 1.987, "num_input_tokens_seen": 588226272, "step": 577700 }, { "epoch": 11.376031186627552, "grad_norm": 1.8856823444366455, "learning_rate": 1.3724191868622626e-07, "loss": 2.0012, "num_input_tokens_seen": 588327776, "step": 577800 }, { "epoch": 11.378000039377055, "grad_norm": 2.0332820415496826, "learning_rate": 1.3710005506586948e-07, "loss": 1.9756, "num_input_tokens_seen": 588429640, "step": 577900 }, { "epoch": 11.379968892126557, "grad_norm": 2.006032943725586, "learning_rate": 1.3695825315358745e-07, "loss": 1.966, "num_input_tokens_seen": 588532040, "step": 578000 }, { "epoch": 11.38193774487606, "grad_norm": 1.7524176836013794, "learning_rate": 1.368165129734924e-07, "loss": 1.9319, "num_input_tokens_seen": 588634440, "step": 578100 }, { "epoch": 11.383906597625563, "grad_norm": 2.0088958740234375, "learning_rate": 1.3667483454968614e-07, "loss": 1.9359, "num_input_tokens_seen": 588736840, "step": 578200 }, { "epoch": 11.385875450375066, "grad_norm": 1.8933031558990479, "learning_rate": 1.3653321790626e-07, "loss": 1.9927, "num_input_tokens_seen": 588838304, "step": 578300 }, { "epoch": 11.38784430312457, "grad_norm": 1.934854507446289, "learning_rate": 1.3639166306729432e-07, "loss": 1.9792, "num_input_tokens_seen": 588939856, "step": 578400 }, { "epoch": 11.389813155874073, "grad_norm": 1.7932626008987427, "learning_rate": 1.3625017005685958e-07, "loss": 2.031, "num_input_tokens_seen": 589040408, "step": 578500 }, { "epoch": 11.391782008623576, "grad_norm": 2.015106439590454, "learning_rate": 1.361087388990154e-07, "loss": 2.0541, "num_input_tokens_seen": 589141480, "step": 578600 }, { "epoch": 11.393750861373078, "grad_norm": 2.0154497623443604, "learning_rate": 1.3596736961781115e-07, "loss": 1.9962, "num_input_tokens_seen": 589243880, "step": 578700 }, { "epoch": 11.395719714122581, "grad_norm": 1.980150818824768, "learning_rate": 1.358260622372851e-07, "loss": 1.9847, "num_input_tokens_seen": 589345728, "step": 578800 }, { "epoch": 11.397688566872084, "grad_norm": 1.8823072910308838, "learning_rate": 1.356848167814656e-07, "loss": 2.0179, "num_input_tokens_seen": 589446256, "step": 578900 }, { "epoch": 11.399657419621587, "grad_norm": 2.0101981163024902, "learning_rate": 1.3554363327437023e-07, "loss": 1.9559, "num_input_tokens_seen": 589548656, "step": 579000 }, { "epoch": 11.40162627237109, "grad_norm": 4.090921878814697, "learning_rate": 1.3540251174000606e-07, "loss": 1.9844, "num_input_tokens_seen": 589649688, "step": 579100 }, { "epoch": 11.403595125120592, "grad_norm": 1.813073992729187, "learning_rate": 1.3526145220236973e-07, "loss": 2.0004, "num_input_tokens_seen": 589749616, "step": 579200 }, { "epoch": 11.405563977870095, "grad_norm": 2.2304625511169434, "learning_rate": 1.3512045468544703e-07, "loss": 2.0328, "num_input_tokens_seen": 589852016, "step": 579300 }, { "epoch": 11.407532830619598, "grad_norm": 2.0579187870025635, "learning_rate": 1.3497951921321337e-07, "loss": 2.0259, "num_input_tokens_seen": 589953456, "step": 579400 }, { "epoch": 11.4095016833691, "grad_norm": 2.0351145267486572, "learning_rate": 1.3483864580963384e-07, "loss": 2.0185, "num_input_tokens_seen": 590055088, "step": 579500 }, { "epoch": 11.411470536118603, "grad_norm": 2.1744699478149414, "learning_rate": 1.346978344986628e-07, "loss": 2.0111, "num_input_tokens_seen": 590157488, "step": 579600 }, { "epoch": 11.413439388868106, "grad_norm": 1.747322916984558, "learning_rate": 1.3455708530424375e-07, "loss": 1.9501, "num_input_tokens_seen": 590259344, "step": 579700 }, { "epoch": 11.415408241617609, "grad_norm": 2.6944944858551025, "learning_rate": 1.3441639825031003e-07, "loss": 2.0623, "num_input_tokens_seen": 590360504, "step": 579800 }, { "epoch": 11.417377094367112, "grad_norm": 2.2272725105285645, "learning_rate": 1.3427577336078431e-07, "loss": 1.9529, "num_input_tokens_seen": 590462464, "step": 579900 }, { "epoch": 11.419345947116614, "grad_norm": 2.331481695175171, "learning_rate": 1.3413521065957873e-07, "loss": 2.0399, "num_input_tokens_seen": 590564472, "step": 580000 }, { "epoch": 11.421314799866119, "grad_norm": 2.06408429145813, "learning_rate": 1.3399471017059483e-07, "loss": 2.0402, "num_input_tokens_seen": 590665696, "step": 580100 }, { "epoch": 11.423283652615622, "grad_norm": 2.0692265033721924, "learning_rate": 1.3385427191772315e-07, "loss": 2.0511, "num_input_tokens_seen": 590768096, "step": 580200 }, { "epoch": 11.425252505365124, "grad_norm": 1.6960629224777222, "learning_rate": 1.3371389592484461e-07, "loss": 1.9878, "num_input_tokens_seen": 590869752, "step": 580300 }, { "epoch": 11.427221358114627, "grad_norm": 2.9542627334594727, "learning_rate": 1.3357358221582854e-07, "loss": 1.9829, "num_input_tokens_seen": 590972152, "step": 580400 }, { "epoch": 11.42919021086413, "grad_norm": 1.9720667600631714, "learning_rate": 1.3343333081453434e-07, "loss": 1.9853, "num_input_tokens_seen": 591074552, "step": 580500 }, { "epoch": 11.431159063613633, "grad_norm": 1.8150570392608643, "learning_rate": 1.3329314174481022e-07, "loss": 1.9869, "num_input_tokens_seen": 591176088, "step": 580600 }, { "epoch": 11.433127916363135, "grad_norm": 1.9600450992584229, "learning_rate": 1.3315301503049464e-07, "loss": 1.9784, "num_input_tokens_seen": 591278480, "step": 580700 }, { "epoch": 11.435096769112638, "grad_norm": 1.910221815109253, "learning_rate": 1.3301295069541464e-07, "loss": 1.9742, "num_input_tokens_seen": 591380304, "step": 580800 }, { "epoch": 11.437065621862141, "grad_norm": 1.8686867952346802, "learning_rate": 1.32872948763387e-07, "loss": 1.9576, "num_input_tokens_seen": 591481872, "step": 580900 }, { "epoch": 11.439034474611644, "grad_norm": 1.9554502964019775, "learning_rate": 1.3273300925821795e-07, "loss": 1.982, "num_input_tokens_seen": 591583600, "step": 581000 }, { "epoch": 11.441003327361146, "grad_norm": 1.949562668800354, "learning_rate": 1.3259313220370298e-07, "loss": 2.0285, "num_input_tokens_seen": 591686000, "step": 581100 }, { "epoch": 11.44297218011065, "grad_norm": 2.226060628890991, "learning_rate": 1.3245331762362723e-07, "loss": 2.0074, "num_input_tokens_seen": 591788400, "step": 581200 }, { "epoch": 11.444941032860152, "grad_norm": 2.0835230350494385, "learning_rate": 1.323135655417646e-07, "loss": 2.0276, "num_input_tokens_seen": 591890496, "step": 581300 }, { "epoch": 11.446909885609655, "grad_norm": 2.0677332878112793, "learning_rate": 1.3217387598187895e-07, "loss": 2.017, "num_input_tokens_seen": 591991800, "step": 581400 }, { "epoch": 11.448878738359157, "grad_norm": 1.9477649927139282, "learning_rate": 1.320342489677233e-07, "loss": 1.9957, "num_input_tokens_seen": 592093008, "step": 581500 }, { "epoch": 11.45084759110866, "grad_norm": 1.973305583000183, "learning_rate": 1.318946845230402e-07, "loss": 1.9603, "num_input_tokens_seen": 592194024, "step": 581600 }, { "epoch": 11.452816443858163, "grad_norm": 1.8793494701385498, "learning_rate": 1.317551826715611e-07, "loss": 1.9894, "num_input_tokens_seen": 592295600, "step": 581700 }, { "epoch": 11.454785296607668, "grad_norm": 1.967320203781128, "learning_rate": 1.3161574343700727e-07, "loss": 1.9842, "num_input_tokens_seen": 592396872, "step": 581800 }, { "epoch": 11.45675414935717, "grad_norm": 2.103950023651123, "learning_rate": 1.3147636684308917e-07, "loss": 2.0282, "num_input_tokens_seen": 592498104, "step": 581900 }, { "epoch": 11.458723002106673, "grad_norm": 1.748271107673645, "learning_rate": 1.313370529135066e-07, "loss": 1.9633, "num_input_tokens_seen": 592597736, "step": 582000 }, { "epoch": 11.460691854856176, "grad_norm": 2.0629665851593018, "learning_rate": 1.3119780167194893e-07, "loss": 1.9863, "num_input_tokens_seen": 592700136, "step": 582100 }, { "epoch": 11.462660707605679, "grad_norm": 2.018004894256592, "learning_rate": 1.3105861314209426e-07, "loss": 1.9724, "num_input_tokens_seen": 592801664, "step": 582200 }, { "epoch": 11.464629560355181, "grad_norm": 2.0310120582580566, "learning_rate": 1.3091948734761065e-07, "loss": 2.0213, "num_input_tokens_seen": 592903792, "step": 582300 }, { "epoch": 11.466598413104684, "grad_norm": 6.117821216583252, "learning_rate": 1.3078042431215524e-07, "loss": 1.9787, "num_input_tokens_seen": 593005504, "step": 582400 }, { "epoch": 11.468567265854187, "grad_norm": 1.8119415044784546, "learning_rate": 1.3064142405937472e-07, "loss": 1.9885, "num_input_tokens_seen": 593106544, "step": 582500 }, { "epoch": 11.47053611860369, "grad_norm": 2.0592005252838135, "learning_rate": 1.3050248661290452e-07, "loss": 1.9836, "num_input_tokens_seen": 593208352, "step": 582600 }, { "epoch": 11.472504971353192, "grad_norm": 1.7434622049331665, "learning_rate": 1.3036361199637002e-07, "loss": 1.9239, "num_input_tokens_seen": 593310000, "step": 582700 }, { "epoch": 11.474473824102695, "grad_norm": 1.8289612531661987, "learning_rate": 1.3022480023338567e-07, "loss": 2.0132, "num_input_tokens_seen": 593411416, "step": 582800 }, { "epoch": 11.476442676852198, "grad_norm": 1.9406064748764038, "learning_rate": 1.300860513475553e-07, "loss": 1.9853, "num_input_tokens_seen": 593513064, "step": 582900 }, { "epoch": 11.4784115296017, "grad_norm": 2.0607354640960693, "learning_rate": 1.299473653624718e-07, "loss": 1.975, "num_input_tokens_seen": 593615464, "step": 583000 }, { "epoch": 11.480380382351203, "grad_norm": 1.8850548267364502, "learning_rate": 1.2980874230171751e-07, "loss": 1.9976, "num_input_tokens_seen": 593716144, "step": 583100 }, { "epoch": 11.482349235100706, "grad_norm": 2.0662503242492676, "learning_rate": 1.296701821888645e-07, "loss": 1.9811, "num_input_tokens_seen": 593818544, "step": 583200 }, { "epoch": 11.484318087850209, "grad_norm": 1.8697055578231812, "learning_rate": 1.295316850474734e-07, "loss": 1.9803, "num_input_tokens_seen": 593920944, "step": 583300 }, { "epoch": 11.486286940599712, "grad_norm": 1.941184401512146, "learning_rate": 1.293932509010947e-07, "loss": 1.9224, "num_input_tokens_seen": 594023344, "step": 583400 }, { "epoch": 11.488255793349216, "grad_norm": 2.114168405532837, "learning_rate": 1.2925487977326753e-07, "loss": 2.0006, "num_input_tokens_seen": 594124760, "step": 583500 }, { "epoch": 11.490224646098719, "grad_norm": 1.7260112762451172, "learning_rate": 1.2911657168752138e-07, "loss": 1.9692, "num_input_tokens_seen": 594226512, "step": 583600 }, { "epoch": 11.492193498848222, "grad_norm": 1.895065188407898, "learning_rate": 1.289783266673739e-07, "loss": 2.0289, "num_input_tokens_seen": 594328912, "step": 583700 }, { "epoch": 11.494162351597724, "grad_norm": 2.090946912765503, "learning_rate": 1.2884014473633276e-07, "loss": 1.9865, "num_input_tokens_seen": 594431312, "step": 583800 }, { "epoch": 11.496131204347227, "grad_norm": 1.916996955871582, "learning_rate": 1.287020259178942e-07, "loss": 1.9655, "num_input_tokens_seen": 594533712, "step": 583900 }, { "epoch": 11.49810005709673, "grad_norm": 1.820854902267456, "learning_rate": 1.285639702355446e-07, "loss": 1.969, "num_input_tokens_seen": 594636112, "step": 584000 }, { "epoch": 11.500068909846233, "grad_norm": 1.9672008752822876, "learning_rate": 1.284259777127592e-07, "loss": 1.9958, "num_input_tokens_seen": 594737096, "step": 584100 }, { "epoch": 11.502037762595736, "grad_norm": 2.0690979957580566, "learning_rate": 1.282880483730021e-07, "loss": 1.9662, "num_input_tokens_seen": 594839496, "step": 584200 }, { "epoch": 11.504006615345238, "grad_norm": 2.628361701965332, "learning_rate": 1.2815018223972728e-07, "loss": 1.9606, "num_input_tokens_seen": 594941448, "step": 584300 }, { "epoch": 11.505975468094741, "grad_norm": 2.009925365447998, "learning_rate": 1.280123793363776e-07, "loss": 2.028, "num_input_tokens_seen": 595043848, "step": 584400 }, { "epoch": 11.507944320844244, "grad_norm": 2.4728260040283203, "learning_rate": 1.2787463968638546e-07, "loss": 1.979, "num_input_tokens_seen": 595145456, "step": 584500 }, { "epoch": 11.509913173593747, "grad_norm": 1.9369709491729736, "learning_rate": 1.277369633131721e-07, "loss": 2.0121, "num_input_tokens_seen": 595246472, "step": 584600 }, { "epoch": 11.51188202634325, "grad_norm": 4.695319652557373, "learning_rate": 1.2759935024014822e-07, "loss": 2.0076, "num_input_tokens_seen": 595348120, "step": 584700 }, { "epoch": 11.513850879092752, "grad_norm": 1.899884819984436, "learning_rate": 1.2746180049071392e-07, "loss": 1.9597, "num_input_tokens_seen": 595450520, "step": 584800 }, { "epoch": 11.515819731842255, "grad_norm": 1.9921207427978516, "learning_rate": 1.273243140882585e-07, "loss": 1.9856, "num_input_tokens_seen": 595552144, "step": 584900 }, { "epoch": 11.517788584591758, "grad_norm": 1.755434513092041, "learning_rate": 1.2718689105616e-07, "loss": 1.9415, "num_input_tokens_seen": 595654544, "step": 585000 }, { "epoch": 11.519757437341262, "grad_norm": 3.659200668334961, "learning_rate": 3.94675918389431e-09, "loss": 1.9812, "num_input_tokens_seen": 595756448, "step": 585100 }, { "epoch": 11.521726290090765, "grad_norm": 2.1249501705169678, "learning_rate": 3.914506629807912e-09, "loss": 2.0162, "num_input_tokens_seen": 595858648, "step": 585200 }, { "epoch": 11.523695142840268, "grad_norm": 2.6715171337127686, "learning_rate": 3.882385881513572e-09, "loss": 1.9694, "num_input_tokens_seen": 595960224, "step": 585300 }, { "epoch": 11.52566399558977, "grad_norm": 1.8062368631362915, "learning_rate": 3.8503969475454645e-09, "loss": 1.9692, "num_input_tokens_seen": 596059680, "step": 585400 }, { "epoch": 11.527632848339273, "grad_norm": 1.8949953317642212, "learning_rate": 3.818539836402901e-09, "loss": 1.9634, "num_input_tokens_seen": 596160920, "step": 585500 }, { "epoch": 11.529601701088776, "grad_norm": 1.8594651222229004, "learning_rate": 3.786814556549889e-09, "loss": 2.0249, "num_input_tokens_seen": 596262168, "step": 585600 }, { "epoch": 11.531570553838279, "grad_norm": 1.5639607906341553, "learning_rate": 3.755221116415685e-09, "loss": 1.9759, "num_input_tokens_seen": 596364568, "step": 585700 }, { "epoch": 11.533539406587781, "grad_norm": 1.8152952194213867, "learning_rate": 3.7237595243944097e-09, "loss": 2.0019, "num_input_tokens_seen": 596466344, "step": 585800 }, { "epoch": 11.535508259337284, "grad_norm": 2.1235945224761963, "learning_rate": 3.6924297888450974e-09, "loss": 1.98, "num_input_tokens_seen": 596568744, "step": 585900 }, { "epoch": 11.537477112086787, "grad_norm": 1.9921948909759521, "learning_rate": 3.661231918091756e-09, "loss": 1.9993, "num_input_tokens_seen": 596670520, "step": 586000 }, { "epoch": 11.53944596483629, "grad_norm": 2.057854652404785, "learning_rate": 3.6301659204235335e-09, "loss": 1.9438, "num_input_tokens_seen": 596771432, "step": 586100 }, { "epoch": 11.541414817585792, "grad_norm": 2.0799715518951416, "learning_rate": 3.5992318040942714e-09, "loss": 2.0, "num_input_tokens_seen": 596873104, "step": 586200 }, { "epoch": 11.543383670335295, "grad_norm": 2.0648226737976074, "learning_rate": 3.5684295773230066e-09, "loss": 2.0493, "num_input_tokens_seen": 596974128, "step": 586300 }, { "epoch": 11.545352523084798, "grad_norm": 1.8234760761260986, "learning_rate": 3.5377592482935816e-09, "loss": 1.9846, "num_input_tokens_seen": 597076528, "step": 586400 }, { "epoch": 11.5473213758343, "grad_norm": 1.8485310077667236, "learning_rate": 3.5072208251548665e-09, "loss": 1.9597, "num_input_tokens_seen": 597178928, "step": 586500 }, { "epoch": 11.549290228583803, "grad_norm": 1.9488697052001953, "learning_rate": 3.4768143160206486e-09, "loss": 2.0429, "num_input_tokens_seen": 597280600, "step": 586600 }, { "epoch": 11.551259081333306, "grad_norm": 2.2161998748779297, "learning_rate": 3.4465397289696884e-09, "loss": 1.9481, "num_input_tokens_seen": 597383000, "step": 586700 }, { "epoch": 11.553227934082809, "grad_norm": 2.0033934116363525, "learning_rate": 3.416397072045718e-09, "loss": 1.9765, "num_input_tokens_seen": 597484064, "step": 586800 }, { "epoch": 11.555196786832314, "grad_norm": 1.6994107961654663, "learning_rate": 3.386386353257387e-09, "loss": 2.05, "num_input_tokens_seen": 597586464, "step": 586900 }, { "epoch": 11.557165639581816, "grad_norm": 1.991618037223816, "learning_rate": 3.3565075805782605e-09, "loss": 2.0157, "num_input_tokens_seen": 597686616, "step": 587000 }, { "epoch": 11.559134492331319, "grad_norm": 1.9513188600540161, "learning_rate": 3.326760761946934e-09, "loss": 2.0172, "num_input_tokens_seen": 597787712, "step": 587100 }, { "epoch": 11.561103345080822, "grad_norm": 1.7938748598098755, "learning_rate": 3.2971459052668627e-09, "loss": 1.9731, "num_input_tokens_seen": 597890112, "step": 587200 }, { "epoch": 11.563072197830325, "grad_norm": 2.0966484546661377, "learning_rate": 3.2676630184064193e-09, "loss": 2.021, "num_input_tokens_seen": 597991024, "step": 587300 }, { "epoch": 11.565041050579827, "grad_norm": 2.132781505584717, "learning_rate": 3.238312109199004e-09, "loss": 1.9729, "num_input_tokens_seen": 598093424, "step": 587400 }, { "epoch": 11.56700990332933, "grad_norm": 1.893375277519226, "learning_rate": 3.2090931854429347e-09, "loss": 2.0153, "num_input_tokens_seen": 598195824, "step": 587500 }, { "epoch": 11.568978756078833, "grad_norm": 2.136014223098755, "learning_rate": 3.1800062549013906e-09, "loss": 1.9696, "num_input_tokens_seen": 598296504, "step": 587600 }, { "epoch": 11.570947608828336, "grad_norm": 1.9575252532958984, "learning_rate": 3.151051325302523e-09, "loss": 1.9419, "num_input_tokens_seen": 598398328, "step": 587700 }, { "epoch": 11.572916461577838, "grad_norm": 1.770530343055725, "learning_rate": 3.122228404339511e-09, "loss": 2.0142, "num_input_tokens_seen": 598500728, "step": 587800 }, { "epoch": 11.574885314327341, "grad_norm": 2.0865249633789062, "learning_rate": 3.0935374996702292e-09, "loss": 2.0028, "num_input_tokens_seen": 598602488, "step": 587900 }, { "epoch": 11.576854167076844, "grad_norm": 2.0338327884674072, "learning_rate": 3.0649786189176908e-09, "loss": 1.98, "num_input_tokens_seen": 598703832, "step": 588000 }, { "epoch": 11.578823019826347, "grad_norm": 1.9522576332092285, "learning_rate": 3.0365517696697705e-09, "loss": 2.0216, "num_input_tokens_seen": 598805000, "step": 588100 }, { "epoch": 11.58079187257585, "grad_norm": 2.122528314590454, "learning_rate": 3.0082569594792052e-09, "loss": 1.9755, "num_input_tokens_seen": 598906680, "step": 588200 }, { "epoch": 11.582760725325352, "grad_norm": 1.6766784191131592, "learning_rate": 2.9800941958637027e-09, "loss": 2.0083, "num_input_tokens_seen": 599006296, "step": 588300 }, { "epoch": 11.584729578074857, "grad_norm": 2.1729745864868164, "learning_rate": 2.9520634863058336e-09, "loss": 2.0161, "num_input_tokens_seen": 599106608, "step": 588400 }, { "epoch": 11.58669843082436, "grad_norm": 2.253821611404419, "learning_rate": 2.924164838253196e-09, "loss": 2.0157, "num_input_tokens_seen": 599208192, "step": 588500 }, { "epoch": 11.588667283573862, "grad_norm": 1.7080379724502563, "learning_rate": 2.8963982591182488e-09, "loss": 1.9792, "num_input_tokens_seen": 599309664, "step": 588600 }, { "epoch": 11.590636136323365, "grad_norm": 1.9355628490447998, "learning_rate": 2.868763756278203e-09, "loss": 1.9482, "num_input_tokens_seen": 599411336, "step": 588700 }, { "epoch": 11.592604989072868, "grad_norm": 1.8168165683746338, "learning_rate": 2.8412613370754624e-09, "loss": 2.0295, "num_input_tokens_seen": 599513736, "step": 588800 }, { "epoch": 11.59457384182237, "grad_norm": 1.9750293493270874, "learning_rate": 2.8138910088170708e-09, "loss": 1.9778, "num_input_tokens_seen": 599616136, "step": 588900 }, { "epoch": 11.596542694571873, "grad_norm": 2.0568883419036865, "learning_rate": 2.786652778775267e-09, "loss": 2.0103, "num_input_tokens_seen": 599718536, "step": 589000 }, { "epoch": 11.598511547321376, "grad_norm": 1.8482158184051514, "learning_rate": 2.7595466541868175e-09, "loss": 1.9449, "num_input_tokens_seen": 599820168, "step": 589100 }, { "epoch": 11.600480400070879, "grad_norm": 2.0327420234680176, "learning_rate": 2.7325726422536833e-09, "loss": 2.011, "num_input_tokens_seen": 599922480, "step": 589200 }, { "epoch": 11.602449252820382, "grad_norm": 1.763657569885254, "learning_rate": 2.705730750142743e-09, "loss": 1.9304, "num_input_tokens_seen": 600023624, "step": 589300 }, { "epoch": 11.604418105569884, "grad_norm": 1.5697556734085083, "learning_rate": 2.6790209849854584e-09, "loss": 1.9709, "num_input_tokens_seen": 600125280, "step": 589400 }, { "epoch": 11.606386958319387, "grad_norm": 1.759230136871338, "learning_rate": 2.6524433538785418e-09, "loss": 1.9486, "num_input_tokens_seen": 600227680, "step": 589500 }, { "epoch": 11.60835581106889, "grad_norm": 1.822941780090332, "learning_rate": 2.6259978638834e-09, "loss": 1.9941, "num_input_tokens_seen": 600328576, "step": 589600 }, { "epoch": 11.610324663818393, "grad_norm": 1.6592353582382202, "learning_rate": 2.599684522026413e-09, "loss": 1.9774, "num_input_tokens_seen": 600430928, "step": 589700 }, { "epoch": 11.612293516567895, "grad_norm": 2.0996155738830566, "learning_rate": 2.5735033352987656e-09, "loss": 1.9954, "num_input_tokens_seen": 600532056, "step": 589800 }, { "epoch": 11.614262369317398, "grad_norm": 1.7824513912200928, "learning_rate": 2.547454310656616e-09, "loss": 2.0438, "num_input_tokens_seen": 600632248, "step": 589900 }, { "epoch": 11.6162312220669, "grad_norm": 1.7518781423568726, "learning_rate": 2.521537455020928e-09, "loss": 2.0343, "num_input_tokens_seen": 600733752, "step": 590000 }, { "epoch": 11.618200074816404, "grad_norm": 2.099236011505127, "learning_rate": 2.4957527752776376e-09, "loss": 2.0148, "num_input_tokens_seen": 600835576, "step": 590100 }, { "epoch": 11.620168927565908, "grad_norm": 1.8308309316635132, "learning_rate": 2.470100278277487e-09, "loss": 1.9398, "num_input_tokens_seen": 600937968, "step": 590200 }, { "epoch": 11.622137780315411, "grad_norm": 1.9775854349136353, "learning_rate": 2.4445799708361914e-09, "loss": 2.0115, "num_input_tokens_seen": 601038016, "step": 590300 }, { "epoch": 11.624106633064914, "grad_norm": 2.018115997314453, "learning_rate": 2.419191859734271e-09, "loss": 1.9977, "num_input_tokens_seen": 601140416, "step": 590400 }, { "epoch": 11.626075485814416, "grad_norm": 1.8581650257110596, "learning_rate": 2.393935951717052e-09, "loss": 1.979, "num_input_tokens_seen": 601241464, "step": 590500 }, { "epoch": 11.62804433856392, "grad_norm": 1.8789161443710327, "learning_rate": 2.368812253494945e-09, "loss": 2.0179, "num_input_tokens_seen": 601343864, "step": 590600 }, { "epoch": 11.630013191313422, "grad_norm": 2.103886842727661, "learning_rate": 2.343820771742999e-09, "loss": 1.9422, "num_input_tokens_seen": 601445440, "step": 590700 }, { "epoch": 11.631982044062925, "grad_norm": 2.323665142059326, "learning_rate": 2.318961513101292e-09, "loss": 1.971, "num_input_tokens_seen": 601547200, "step": 590800 }, { "epoch": 11.633950896812427, "grad_norm": 1.7970967292785645, "learning_rate": 2.294234484174762e-09, "loss": 1.99, "num_input_tokens_seen": 601649600, "step": 590900 }, { "epoch": 11.63591974956193, "grad_norm": 1.786669373512268, "learning_rate": 2.269639691533043e-09, "loss": 2.0106, "num_input_tokens_seen": 601751416, "step": 591000 }, { "epoch": 11.637888602311433, "grad_norm": 2.2270216941833496, "learning_rate": 2.2451771417109077e-09, "loss": 1.9897, "num_input_tokens_seen": 601853120, "step": 591100 }, { "epoch": 11.639857455060936, "grad_norm": 1.8334543704986572, "learning_rate": 2.220846841207824e-09, "loss": 2.0037, "num_input_tokens_seen": 601955520, "step": 591200 }, { "epoch": 11.641826307810438, "grad_norm": 1.9184553623199463, "learning_rate": 2.1966487964880652e-09, "loss": 1.9609, "num_input_tokens_seen": 602057920, "step": 591300 }, { "epoch": 11.643795160559941, "grad_norm": 2.007113218307495, "learning_rate": 2.172583013980933e-09, "loss": 2.0072, "num_input_tokens_seen": 602159032, "step": 591400 }, { "epoch": 11.645764013309444, "grad_norm": 2.028715133666992, "learning_rate": 2.1486495000804796e-09, "loss": 1.9598, "num_input_tokens_seen": 602261432, "step": 591500 }, { "epoch": 11.647732866058947, "grad_norm": 1.79324209690094, "learning_rate": 2.124848261145673e-09, "loss": 1.9845, "num_input_tokens_seen": 602363384, "step": 591600 }, { "epoch": 11.64970171880845, "grad_norm": 1.9027572870254517, "learning_rate": 2.101179303500289e-09, "loss": 1.9423, "num_input_tokens_seen": 602465536, "step": 591700 }, { "epoch": 11.651670571557954, "grad_norm": 1.9445005655288696, "learning_rate": 2.0776426334329077e-09, "loss": 1.9774, "num_input_tokens_seen": 602567936, "step": 591800 }, { "epoch": 11.653639424307457, "grad_norm": 1.8726835250854492, "learning_rate": 2.0542382571971384e-09, "loss": 2.0093, "num_input_tokens_seen": 602669528, "step": 591900 }, { "epoch": 11.65560827705696, "grad_norm": 1.7551376819610596, "learning_rate": 2.0309661810112287e-09, "loss": 1.9717, "num_input_tokens_seen": 602771928, "step": 592000 }, { "epoch": 11.657577129806462, "grad_norm": 1.9877421855926514, "learning_rate": 2.0078264110584e-09, "loss": 1.9794, "num_input_tokens_seen": 602874328, "step": 592100 }, { "epoch": 11.659545982555965, "grad_norm": 1.8482903242111206, "learning_rate": 1.9848189534867332e-09, "loss": 1.9593, "num_input_tokens_seen": 602976112, "step": 592200 }, { "epoch": 11.661514835305468, "grad_norm": 2.0622079372406006, "learning_rate": 1.9619438144091173e-09, "loss": 1.9501, "num_input_tokens_seen": 603078512, "step": 592300 }, { "epoch": 11.66348368805497, "grad_norm": 1.8774306774139404, "learning_rate": 1.9392009999032455e-09, "loss": 1.9638, "num_input_tokens_seen": 603180912, "step": 592400 }, { "epoch": 11.665452540804473, "grad_norm": 2.006685972213745, "learning_rate": 1.916590516011729e-09, "loss": 1.9802, "num_input_tokens_seen": 603282784, "step": 592500 }, { "epoch": 11.667421393553976, "grad_norm": 1.8034919500350952, "learning_rate": 1.8941123687419292e-09, "loss": 1.9731, "num_input_tokens_seen": 603385184, "step": 592600 }, { "epoch": 11.669390246303479, "grad_norm": 1.901477575302124, "learning_rate": 1.871766564066235e-09, "loss": 1.9767, "num_input_tokens_seen": 603487584, "step": 592700 }, { "epoch": 11.671359099052982, "grad_norm": 1.9936275482177734, "learning_rate": 1.8495531079216199e-09, "loss": 1.9927, "num_input_tokens_seen": 603589520, "step": 592800 }, { "epoch": 11.673327951802484, "grad_norm": 2.0637118816375732, "learning_rate": 1.827472006210029e-09, "loss": 1.9931, "num_input_tokens_seen": 603690792, "step": 592900 }, { "epoch": 11.675296804551987, "grad_norm": 1.935971736907959, "learning_rate": 1.8055232647982698e-09, "loss": 1.9789, "num_input_tokens_seen": 603793192, "step": 593000 }, { "epoch": 11.67726565730149, "grad_norm": 2.0571725368499756, "learning_rate": 1.7837068895178997e-09, "loss": 2.0209, "num_input_tokens_seen": 603894648, "step": 593100 }, { "epoch": 11.679234510050993, "grad_norm": 2.2053656578063965, "learning_rate": 1.7620228861653374e-09, "loss": 1.9245, "num_input_tokens_seen": 603997000, "step": 593200 }, { "epoch": 11.681203362800495, "grad_norm": 1.9412939548492432, "learning_rate": 1.740471260501919e-09, "loss": 2.0252, "num_input_tokens_seen": 604099400, "step": 593300 }, { "epoch": 11.683172215549998, "grad_norm": 2.307305097579956, "learning_rate": 1.719052018253675e-09, "loss": 2.0151, "num_input_tokens_seen": 604201128, "step": 593400 }, { "epoch": 11.685141068299501, "grad_norm": 1.7656645774841309, "learning_rate": 1.6977651651114976e-09, "loss": 1.9501, "num_input_tokens_seen": 604302984, "step": 593500 }, { "epoch": 11.687109921049005, "grad_norm": 2.1728756427764893, "learning_rate": 1.6766107067311964e-09, "loss": 2.0443, "num_input_tokens_seen": 604404160, "step": 593600 }, { "epoch": 11.689078773798508, "grad_norm": 2.0321216583251953, "learning_rate": 1.6555886487332749e-09, "loss": 1.9772, "num_input_tokens_seen": 604505368, "step": 593700 }, { "epoch": 11.691047626548011, "grad_norm": 1.8394025564193726, "learning_rate": 1.6346989967031545e-09, "loss": 1.993, "num_input_tokens_seen": 604607768, "step": 593800 }, { "epoch": 11.693016479297514, "grad_norm": 6.243096351623535, "learning_rate": 1.6139417561910063e-09, "loss": 2.0026, "num_input_tokens_seen": 604709664, "step": 593900 }, { "epoch": 11.694985332047017, "grad_norm": 1.8713412284851074, "learning_rate": 1.5933169327119189e-09, "loss": 2.0245, "num_input_tokens_seen": 604809960, "step": 594000 }, { "epoch": 11.69695418479652, "grad_norm": 2.0375311374664307, "learning_rate": 1.5728245317456756e-09, "loss": 2.0352, "num_input_tokens_seen": 604912360, "step": 594100 }, { "epoch": 11.698923037546022, "grad_norm": 2.0358030796051025, "learning_rate": 1.5524645587369768e-09, "loss": 1.9875, "num_input_tokens_seen": 605014080, "step": 594200 }, { "epoch": 11.700891890295525, "grad_norm": 2.111898899078369, "learning_rate": 1.5322370190952727e-09, "loss": 1.9973, "num_input_tokens_seen": 605116480, "step": 594300 }, { "epoch": 11.702860743045028, "grad_norm": 2.085745334625244, "learning_rate": 1.5121419181948204e-09, "loss": 2.0091, "num_input_tokens_seen": 605218016, "step": 594400 }, { "epoch": 11.70482959579453, "grad_norm": 2.052598237991333, "learning_rate": 1.492179261374793e-09, "loss": 2.0294, "num_input_tokens_seen": 605319816, "step": 594500 }, { "epoch": 11.706798448544033, "grad_norm": 1.9992872476577759, "learning_rate": 1.4723490539390592e-09, "loss": 1.9653, "num_input_tokens_seen": 605421400, "step": 594600 }, { "epoch": 11.708767301293536, "grad_norm": 1.8020381927490234, "learning_rate": 1.4526513011564046e-09, "loss": 1.9635, "num_input_tokens_seen": 605523800, "step": 594700 }, { "epoch": 11.710736154043039, "grad_norm": 1.9601142406463623, "learning_rate": 1.4330860082602535e-09, "loss": 1.9859, "num_input_tokens_seen": 605626200, "step": 594800 }, { "epoch": 11.712705006792541, "grad_norm": 1.6479361057281494, "learning_rate": 1.4136531804490036e-09, "loss": 1.9933, "num_input_tokens_seen": 605728600, "step": 594900 }, { "epoch": 11.714673859542044, "grad_norm": 2.1334123611450195, "learning_rate": 1.3943528228858026e-09, "loss": 1.9638, "num_input_tokens_seen": 605831000, "step": 595000 }, { "epoch": 11.716642712291549, "grad_norm": 1.9267786741256714, "learning_rate": 1.3751849406985483e-09, "loss": 2.022, "num_input_tokens_seen": 605932824, "step": 595100 }, { "epoch": 11.718611565041051, "grad_norm": 1.9408087730407715, "learning_rate": 1.3561495389800004e-09, "loss": 1.9692, "num_input_tokens_seen": 606035224, "step": 595200 }, { "epoch": 11.720580417790554, "grad_norm": 1.870067834854126, "learning_rate": 1.3372466227877243e-09, "loss": 1.9671, "num_input_tokens_seen": 606136760, "step": 595300 }, { "epoch": 11.722549270540057, "grad_norm": 1.7808547019958496, "learning_rate": 1.3184761971440917e-09, "loss": 1.984, "num_input_tokens_seen": 606239160, "step": 595400 }, { "epoch": 11.72451812328956, "grad_norm": 1.933311104774475, "learning_rate": 1.299838267036224e-09, "loss": 1.9937, "num_input_tokens_seen": 606338368, "step": 595500 }, { "epoch": 11.726486976039062, "grad_norm": 1.8346141576766968, "learning_rate": 1.2813328374161047e-09, "loss": 1.9848, "num_input_tokens_seen": 606440768, "step": 595600 }, { "epoch": 11.728455828788565, "grad_norm": 2.006956100463867, "learning_rate": 1.2629599132003566e-09, "loss": 1.9859, "num_input_tokens_seen": 606541504, "step": 595700 }, { "epoch": 11.730424681538068, "grad_norm": 1.6994996070861816, "learning_rate": 1.24471949927063e-09, "loss": 2.0021, "num_input_tokens_seen": 606642144, "step": 595800 }, { "epoch": 11.73239353428757, "grad_norm": 2.083919048309326, "learning_rate": 1.2266116004731598e-09, "loss": 1.9589, "num_input_tokens_seen": 606744544, "step": 595900 }, { "epoch": 11.734362387037073, "grad_norm": 2.079982042312622, "learning_rate": 1.2086362216191525e-09, "loss": 2.0034, "num_input_tokens_seen": 606846352, "step": 596000 }, { "epoch": 11.736331239786576, "grad_norm": 1.7823718786239624, "learning_rate": 1.1907933674845106e-09, "loss": 1.9454, "num_input_tokens_seen": 606947992, "step": 596100 }, { "epoch": 11.738300092536079, "grad_norm": 1.944311499595642, "learning_rate": 1.173083042809886e-09, "loss": 1.9692, "num_input_tokens_seen": 607050392, "step": 596200 }, { "epoch": 11.740268945285582, "grad_norm": 2.179417848587036, "learning_rate": 1.155505252300737e-09, "loss": 1.9869, "num_input_tokens_seen": 607152792, "step": 596300 }, { "epoch": 11.742237798035084, "grad_norm": 1.756500244140625, "learning_rate": 1.1380600006273833e-09, "loss": 1.9897, "num_input_tokens_seen": 607254336, "step": 596400 }, { "epoch": 11.744206650784587, "grad_norm": 2.1323633193969727, "learning_rate": 1.120747292424895e-09, "loss": 1.9508, "num_input_tokens_seen": 607356072, "step": 596500 }, { "epoch": 11.74617550353409, "grad_norm": 2.6009182929992676, "learning_rate": 1.103567132293093e-09, "loss": 2.0472, "num_input_tokens_seen": 607458472, "step": 596600 }, { "epoch": 11.748144356283593, "grad_norm": 1.9170985221862793, "learning_rate": 1.0865195247966585e-09, "loss": 2.0183, "num_input_tokens_seen": 607560872, "step": 596700 }, { "epoch": 11.750113209033096, "grad_norm": 1.8585846424102783, "learning_rate": 1.0696044744649135e-09, "loss": 1.9619, "num_input_tokens_seen": 607663272, "step": 596800 }, { "epoch": 11.7520820617826, "grad_norm": 2.1044414043426514, "learning_rate": 1.0528219857920407e-09, "loss": 1.9705, "num_input_tokens_seen": 607765672, "step": 596900 }, { "epoch": 11.754050914532103, "grad_norm": 1.7990201711654663, "learning_rate": 1.036172063237084e-09, "loss": 1.9639, "num_input_tokens_seen": 607868072, "step": 597000 }, { "epoch": 11.756019767281606, "grad_norm": 1.767758846282959, "learning_rate": 1.0196547112237275e-09, "loss": 2.018, "num_input_tokens_seen": 607970472, "step": 597100 }, { "epoch": 11.757988620031108, "grad_norm": 1.8967418670654297, "learning_rate": 1.0032699341405716e-09, "loss": 1.9725, "num_input_tokens_seen": 608071744, "step": 597200 }, { "epoch": 11.759957472780611, "grad_norm": 2.6396100521087646, "learning_rate": 9.870177363408005e-10, "loss": 1.972, "num_input_tokens_seen": 608174144, "step": 597300 }, { "epoch": 11.761926325530114, "grad_norm": 1.9669471979141235, "learning_rate": 9.70898122142627e-10, "loss": 1.9493, "num_input_tokens_seen": 608276544, "step": 597400 }, { "epoch": 11.763895178279617, "grad_norm": 1.7169256210327148, "learning_rate": 9.549110958287365e-10, "loss": 1.9629, "num_input_tokens_seen": 608378944, "step": 597500 }, { "epoch": 11.76586403102912, "grad_norm": 2.1224753856658936, "learning_rate": 9.390566616468976e-10, "loss": 2.0116, "num_input_tokens_seen": 608480472, "step": 597600 }, { "epoch": 11.767832883778622, "grad_norm": 1.8127522468566895, "learning_rate": 9.23334823809463e-10, "loss": 1.9794, "num_input_tokens_seen": 608582048, "step": 597700 }, { "epoch": 11.769801736528125, "grad_norm": 1.8179248571395874, "learning_rate": 9.077455864934803e-10, "loss": 2.0005, "num_input_tokens_seen": 608682832, "step": 597800 }, { "epoch": 11.771770589277628, "grad_norm": 1.9912502765655518, "learning_rate": 8.92288953841025e-10, "loss": 2.0201, "num_input_tokens_seen": 608785232, "step": 597900 }, { "epoch": 11.77373944202713, "grad_norm": 1.807316780090332, "learning_rate": 8.769649299587568e-10, "loss": 2.0112, "num_input_tokens_seen": 608887632, "step": 598000 }, { "epoch": 11.775708294776633, "grad_norm": 2.461653470993042, "learning_rate": 8.617735189181408e-10, "loss": 1.983, "num_input_tokens_seen": 608988488, "step": 598100 }, { "epoch": 11.777677147526136, "grad_norm": 2.1969923973083496, "learning_rate": 8.467147247553375e-10, "loss": 1.9361, "num_input_tokens_seen": 609090216, "step": 598200 }, { "epoch": 11.779646000275639, "grad_norm": 1.9241276979446411, "learning_rate": 8.317885514714241e-10, "loss": 1.95, "num_input_tokens_seen": 609192616, "step": 598300 }, { "epoch": 11.781614853025141, "grad_norm": 1.9649975299835205, "learning_rate": 8.169950030321726e-10, "loss": 1.9692, "num_input_tokens_seen": 609294416, "step": 598400 }, { "epoch": 11.783583705774646, "grad_norm": 2.0693068504333496, "learning_rate": 8.023340833679948e-10, "loss": 1.9561, "num_input_tokens_seen": 609396352, "step": 598500 }, { "epoch": 11.785552558524149, "grad_norm": 1.730277180671692, "learning_rate": 7.878057963743301e-10, "loss": 1.9669, "num_input_tokens_seen": 609498752, "step": 598600 }, { "epoch": 11.787521411273651, "grad_norm": 1.7669689655303955, "learning_rate": 7.734101459110909e-10, "loss": 2.0171, "num_input_tokens_seen": 609601152, "step": 598700 }, { "epoch": 11.789490264023154, "grad_norm": 2.171191453933716, "learning_rate": 7.59147135803162e-10, "loss": 1.9596, "num_input_tokens_seen": 609702952, "step": 598800 }, { "epoch": 11.791459116772657, "grad_norm": 1.9326591491699219, "learning_rate": 7.450167698400678e-10, "loss": 1.9839, "num_input_tokens_seen": 609804584, "step": 598900 }, { "epoch": 11.79342796952216, "grad_norm": 1.955987811088562, "learning_rate": 7.310190517761383e-10, "loss": 1.967, "num_input_tokens_seen": 609905832, "step": 599000 }, { "epoch": 11.795396822271663, "grad_norm": 1.9000886678695679, "learning_rate": 7.171539853304542e-10, "loss": 1.959, "num_input_tokens_seen": 610007656, "step": 599100 }, { "epoch": 11.797365675021165, "grad_norm": 2.1839611530303955, "learning_rate": 7.034215741868465e-10, "loss": 1.9685, "num_input_tokens_seen": 610109456, "step": 599200 }, { "epoch": 11.799334527770668, "grad_norm": 1.9134931564331055, "learning_rate": 6.898218219938967e-10, "loss": 1.9602, "num_input_tokens_seen": 610211088, "step": 599300 }, { "epoch": 11.80130338052017, "grad_norm": 1.989763617515564, "learning_rate": 6.763547323649921e-10, "loss": 1.9667, "num_input_tokens_seen": 610312904, "step": 599400 }, { "epoch": 11.803272233269674, "grad_norm": 2.162590742111206, "learning_rate": 6.630203088781594e-10, "loss": 1.9997, "num_input_tokens_seen": 610414856, "step": 599500 }, { "epoch": 11.805241086019176, "grad_norm": 2.458364248275757, "learning_rate": 6.49818555076287e-10, "loss": 1.9776, "num_input_tokens_seen": 610514968, "step": 599600 }, { "epoch": 11.807209938768679, "grad_norm": 1.8544301986694336, "learning_rate": 6.367494744669577e-10, "loss": 1.937, "num_input_tokens_seen": 610617368, "step": 599700 }, { "epoch": 11.809178791518182, "grad_norm": 1.9206244945526123, "learning_rate": 6.238130705225053e-10, "loss": 1.9884, "num_input_tokens_seen": 610719000, "step": 599800 }, { "epoch": 11.811147644267685, "grad_norm": 1.8682975769042969, "learning_rate": 6.110093466800137e-10, "loss": 1.9506, "num_input_tokens_seen": 610820752, "step": 599900 }, { "epoch": 11.813116497017187, "grad_norm": 1.937517523765564, "learning_rate": 5.983383063413728e-10, "loss": 1.9925, "num_input_tokens_seen": 610923152, "step": 600000 }, { "epoch": 11.81508534976669, "grad_norm": 1.77732515335083, "learning_rate": 5.857999528731672e-10, "loss": 2.0176, "num_input_tokens_seen": 611024096, "step": 600100 }, { "epoch": 11.817054202516193, "grad_norm": 1.8825489282608032, "learning_rate": 5.733942896066768e-10, "loss": 1.977, "num_input_tokens_seen": 611126496, "step": 600200 }, { "epoch": 11.819023055265697, "grad_norm": 2.0327060222625732, "learning_rate": 5.611213198380427e-10, "loss": 2.0058, "num_input_tokens_seen": 611228088, "step": 600300 }, { "epoch": 11.8209919080152, "grad_norm": 2.0941362380981445, "learning_rate": 5.489810468280454e-10, "loss": 1.948, "num_input_tokens_seen": 611329448, "step": 600400 }, { "epoch": 11.822960760764703, "grad_norm": 2.052814483642578, "learning_rate": 5.369734738023269e-10, "loss": 1.9882, "num_input_tokens_seen": 611431072, "step": 600500 }, { "epoch": 11.824929613514206, "grad_norm": 1.8776636123657227, "learning_rate": 5.250986039511129e-10, "loss": 1.9543, "num_input_tokens_seen": 611533040, "step": 600600 }, { "epoch": 11.826898466263708, "grad_norm": 1.789001226425171, "learning_rate": 5.13356440429491e-10, "loss": 2.0162, "num_input_tokens_seen": 611633952, "step": 600700 }, { "epoch": 11.828867319013211, "grad_norm": 1.8962733745574951, "learning_rate": 5.017469863572987e-10, "loss": 1.97, "num_input_tokens_seen": 611736264, "step": 600800 }, { "epoch": 11.830836171762714, "grad_norm": 2.056675910949707, "learning_rate": 4.902702448190133e-10, "loss": 1.9556, "num_input_tokens_seen": 611838664, "step": 600900 }, { "epoch": 11.832805024512217, "grad_norm": 2.016049861907959, "learning_rate": 4.789262188639176e-10, "loss": 1.9164, "num_input_tokens_seen": 611940568, "step": 601000 }, { "epoch": 11.83477387726172, "grad_norm": 1.9803416728973389, "learning_rate": 4.677149115059897e-10, "loss": 2.0214, "num_input_tokens_seen": 612042136, "step": 601100 }, { "epoch": 11.836742730011222, "grad_norm": 2.1229701042175293, "learning_rate": 4.5663632572412456e-10, "loss": 1.9946, "num_input_tokens_seen": 612144536, "step": 601200 }, { "epoch": 11.838711582760725, "grad_norm": 1.9645607471466064, "learning_rate": 4.456904644616344e-10, "loss": 1.9869, "num_input_tokens_seen": 612246720, "step": 601300 }, { "epoch": 11.840680435510228, "grad_norm": 2.0595479011535645, "learning_rate": 4.348773306268594e-10, "loss": 1.9422, "num_input_tokens_seen": 612347272, "step": 601400 }, { "epoch": 11.84264928825973, "grad_norm": 2.179272413253784, "learning_rate": 4.241969270927237e-10, "loss": 2.0274, "num_input_tokens_seen": 612448768, "step": 601500 }, { "epoch": 11.844618141009233, "grad_norm": 1.8377439975738525, "learning_rate": 4.1364925669695736e-10, "loss": 1.9705, "num_input_tokens_seen": 612551168, "step": 601600 }, { "epoch": 11.846586993758736, "grad_norm": 1.7907642126083374, "learning_rate": 4.0323432224187435e-10, "loss": 2.0292, "num_input_tokens_seen": 612652048, "step": 601700 }, { "epoch": 11.84855584650824, "grad_norm": 1.8527629375457764, "learning_rate": 3.9295212649476105e-10, "loss": 1.9715, "num_input_tokens_seen": 612753936, "step": 601800 }, { "epoch": 11.850524699257743, "grad_norm": 1.6705670356750488, "learning_rate": 3.8280267218748773e-10, "loss": 1.9632, "num_input_tokens_seen": 612856336, "step": 601900 }, { "epoch": 11.852493552007246, "grad_norm": 1.7484914064407349, "learning_rate": 3.727859620166196e-10, "loss": 1.9986, "num_input_tokens_seen": 612957960, "step": 602000 }, { "epoch": 11.854462404756749, "grad_norm": 1.9333302974700928, "learning_rate": 3.6290199864358326e-10, "loss": 1.9875, "num_input_tokens_seen": 613059136, "step": 602100 }, { "epoch": 11.856431257506252, "grad_norm": 1.8948204517364502, "learning_rate": 3.531507846943893e-10, "loss": 1.9834, "num_input_tokens_seen": 613160608, "step": 602200 }, { "epoch": 11.858400110255754, "grad_norm": 2.2163522243499756, "learning_rate": 3.435323227599096e-10, "loss": 1.9761, "num_input_tokens_seen": 613262392, "step": 602300 }, { "epoch": 11.860368963005257, "grad_norm": 2.157076597213745, "learning_rate": 3.340466153957111e-10, "loss": 1.9958, "num_input_tokens_seen": 613364792, "step": 602400 }, { "epoch": 11.86233781575476, "grad_norm": 1.933104157447815, "learning_rate": 3.24693665122e-10, "loss": 1.9794, "num_input_tokens_seen": 613466696, "step": 602500 }, { "epoch": 11.864306668504263, "grad_norm": 2.285022020339966, "learning_rate": 3.1547347442384406e-10, "loss": 1.9959, "num_input_tokens_seen": 613567792, "step": 602600 }, { "epoch": 11.866275521253765, "grad_norm": 1.8007138967514038, "learning_rate": 3.063860457508949e-10, "loss": 2.0291, "num_input_tokens_seen": 613669760, "step": 602700 }, { "epoch": 11.868244374003268, "grad_norm": 1.9165966510772705, "learning_rate": 2.9743138151766546e-10, "loss": 1.9732, "num_input_tokens_seen": 613771232, "step": 602800 }, { "epoch": 11.870213226752771, "grad_norm": 1.8733264207839966, "learning_rate": 2.886094841033637e-10, "loss": 1.9883, "num_input_tokens_seen": 613871376, "step": 602900 }, { "epoch": 11.872182079502274, "grad_norm": 1.8785486221313477, "learning_rate": 2.799203558517815e-10, "loss": 2.0485, "num_input_tokens_seen": 613972416, "step": 603000 }, { "epoch": 11.874150932251776, "grad_norm": 2.178788185119629, "learning_rate": 2.713639990716832e-10, "loss": 1.9821, "num_input_tokens_seen": 614073528, "step": 603100 }, { "epoch": 11.87611978500128, "grad_norm": 1.9850804805755615, "learning_rate": 2.629404160363058e-10, "loss": 1.926, "num_input_tokens_seen": 614175928, "step": 603200 }, { "epoch": 11.878088637750782, "grad_norm": 2.022782325744629, "learning_rate": 2.546496089838035e-10, "loss": 1.9753, "num_input_tokens_seen": 614278328, "step": 603300 }, { "epoch": 11.880057490500285, "grad_norm": 2.0160112380981445, "learning_rate": 2.464915801169143e-10, "loss": 1.9688, "num_input_tokens_seen": 614379912, "step": 603400 }, { "epoch": 11.882026343249787, "grad_norm": 2.0020434856414795, "learning_rate": 2.384663316032376e-10, "loss": 2.0239, "num_input_tokens_seen": 614481512, "step": 603500 }, { "epoch": 11.883995195999292, "grad_norm": 1.973625659942627, "learning_rate": 2.3057386557495674e-10, "loss": 1.9905, "num_input_tokens_seen": 614583072, "step": 603600 }, { "epoch": 11.885964048748795, "grad_norm": 2.12579607963562, "learning_rate": 2.2281418412906095e-10, "loss": 1.9786, "num_input_tokens_seen": 614684672, "step": 603700 }, { "epoch": 11.887932901498298, "grad_norm": 1.7405120134353638, "learning_rate": 2.151872893271789e-10, "loss": 1.9951, "num_input_tokens_seen": 614786264, "step": 603800 }, { "epoch": 11.8899017542478, "grad_norm": 1.7212185859680176, "learning_rate": 2.0769318319580065e-10, "loss": 1.9598, "num_input_tokens_seen": 614888128, "step": 603900 }, { "epoch": 11.891870606997303, "grad_norm": 2.0192983150482178, "learning_rate": 2.0033186772594468e-10, "loss": 2.0105, "num_input_tokens_seen": 614990072, "step": 604000 }, { "epoch": 11.893839459746806, "grad_norm": 2.044783115386963, "learning_rate": 1.9310334487354641e-10, "loss": 1.982, "num_input_tokens_seen": 615091848, "step": 604100 }, { "epoch": 11.895808312496309, "grad_norm": 1.9060918092727661, "learning_rate": 1.8600761655912512e-10, "loss": 1.9581, "num_input_tokens_seen": 615194248, "step": 604200 }, { "epoch": 11.897777165245811, "grad_norm": 2.227484703063965, "learning_rate": 1.7904468466795054e-10, "loss": 1.9834, "num_input_tokens_seen": 615295800, "step": 604300 }, { "epoch": 11.899746017995314, "grad_norm": 2.0130362510681152, "learning_rate": 1.722145510499873e-10, "loss": 2.0045, "num_input_tokens_seen": 615398200, "step": 604400 }, { "epoch": 11.901714870744817, "grad_norm": 1.6936615705490112, "learning_rate": 1.655172175200059e-10, "loss": 1.9594, "num_input_tokens_seen": 615500072, "step": 604500 }, { "epoch": 11.90368372349432, "grad_norm": 2.2261927127838135, "learning_rate": 1.589526858574164e-10, "loss": 1.9738, "num_input_tokens_seen": 615601776, "step": 604600 }, { "epoch": 11.905652576243822, "grad_norm": 2.0863723754882812, "learning_rate": 1.5252095780637907e-10, "loss": 2.0208, "num_input_tokens_seen": 615704176, "step": 604700 }, { "epoch": 11.907621428993325, "grad_norm": 1.7478595972061157, "learning_rate": 1.4622203507569375e-10, "loss": 1.9997, "num_input_tokens_seen": 615805168, "step": 604800 }, { "epoch": 11.909590281742828, "grad_norm": 1.6486343145370483, "learning_rate": 1.400559193389106e-10, "loss": 2.0192, "num_input_tokens_seen": 615907568, "step": 604900 }, { "epoch": 11.91155913449233, "grad_norm": 1.9903755187988281, "learning_rate": 1.340226122344412e-10, "loss": 2.0046, "num_input_tokens_seen": 616009968, "step": 605000 }, { "epoch": 11.913527987241833, "grad_norm": 1.8391920328140259, "learning_rate": 1.2812211536517016e-10, "loss": 1.9669, "num_input_tokens_seen": 616111552, "step": 605100 }, { "epoch": 11.915496839991338, "grad_norm": 1.8323636054992676, "learning_rate": 1.223544302988433e-10, "loss": 1.9956, "num_input_tokens_seen": 616213952, "step": 605200 }, { "epoch": 11.91746569274084, "grad_norm": 1.731201410293579, "learning_rate": 1.167195585679015e-10, "loss": 2.0026, "num_input_tokens_seen": 616316352, "step": 605300 }, { "epoch": 11.919434545490343, "grad_norm": 2.0433614253997803, "learning_rate": 1.1121750166948052e-10, "loss": 1.9802, "num_input_tokens_seen": 616417960, "step": 605400 }, { "epoch": 11.921403398239846, "grad_norm": 1.8426910638809204, "learning_rate": 1.05848261065411e-10, "loss": 1.9955, "num_input_tokens_seen": 616520360, "step": 605500 }, { "epoch": 11.923372250989349, "grad_norm": 1.9176217317581177, "learning_rate": 1.0061183818227403e-10, "loss": 2.008, "num_input_tokens_seen": 616622760, "step": 605600 }, { "epoch": 11.925341103738852, "grad_norm": 1.7819119691848755, "learning_rate": 9.550823441134559e-11, "loss": 2.0313, "num_input_tokens_seen": 616723584, "step": 605700 }, { "epoch": 11.927309956488354, "grad_norm": 1.9518394470214844, "learning_rate": 9.053745110859656e-11, "loss": 2.01, "num_input_tokens_seen": 616824488, "step": 605800 }, { "epoch": 11.929278809237857, "grad_norm": 1.9034873247146606, "learning_rate": 8.569948959474827e-11, "loss": 2.0303, "num_input_tokens_seen": 616926888, "step": 605900 }, { "epoch": 11.93124766198736, "grad_norm": 2.1896543502807617, "learning_rate": 8.099435115521691e-11, "loss": 1.968, "num_input_tokens_seen": 617028512, "step": 606000 }, { "epoch": 11.933216514736863, "grad_norm": 1.953952670097351, "learning_rate": 7.642203704000261e-11, "loss": 2.0075, "num_input_tokens_seen": 617130264, "step": 606100 }, { "epoch": 11.935185367486365, "grad_norm": 1.916153073310852, "learning_rate": 7.198254846407792e-11, "loss": 1.979, "num_input_tokens_seen": 617232320, "step": 606200 }, { "epoch": 11.937154220235868, "grad_norm": 1.7991586923599243, "learning_rate": 6.767588660694378e-11, "loss": 2.0086, "num_input_tokens_seen": 617334088, "step": 606300 }, { "epoch": 11.939123072985371, "grad_norm": 1.894468903541565, "learning_rate": 6.350205261274055e-11, "loss": 1.9897, "num_input_tokens_seen": 617436488, "step": 606400 }, { "epoch": 11.941091925734874, "grad_norm": 1.8279722929000854, "learning_rate": 5.946104759058102e-11, "loss": 2.0329, "num_input_tokens_seen": 617537816, "step": 606500 }, { "epoch": 11.943060778484377, "grad_norm": 1.7615768909454346, "learning_rate": 5.555287261399533e-11, "loss": 1.9813, "num_input_tokens_seen": 617640216, "step": 606600 }, { "epoch": 11.94502963123388, "grad_norm": 1.692265510559082, "learning_rate": 5.1777528721430595e-11, "loss": 1.9675, "num_input_tokens_seen": 617742616, "step": 606700 }, { "epoch": 11.946998483983382, "grad_norm": 1.910791039466858, "learning_rate": 4.8135016915917815e-11, "loss": 2.0206, "num_input_tokens_seen": 617844024, "step": 606800 }, { "epoch": 11.948967336732887, "grad_norm": 2.1531341075897217, "learning_rate": 4.462533816523839e-11, "loss": 1.9695, "num_input_tokens_seen": 617946424, "step": 606900 }, { "epoch": 11.95093618948239, "grad_norm": 2.0287556648254395, "learning_rate": 4.1248493401924154e-11, "loss": 1.9561, "num_input_tokens_seen": 618048824, "step": 607000 }, { "epoch": 11.952905042231892, "grad_norm": 1.763012409210205, "learning_rate": 3.8004483523201846e-11, "loss": 1.9766, "num_input_tokens_seen": 618151224, "step": 607100 }, { "epoch": 11.954873894981395, "grad_norm": 1.935534954071045, "learning_rate": 3.4893309390882084e-11, "loss": 2.033, "num_input_tokens_seen": 618253480, "step": 607200 }, { "epoch": 11.956842747730898, "grad_norm": 1.7291593551635742, "learning_rate": 3.191497183169245e-11, "loss": 1.9738, "num_input_tokens_seen": 618355880, "step": 607300 }, { "epoch": 11.9588116004804, "grad_norm": 1.8182733058929443, "learning_rate": 2.9069471636833375e-11, "loss": 2.0233, "num_input_tokens_seen": 618457680, "step": 607400 }, { "epoch": 11.960780453229903, "grad_norm": 2.1735570430755615, "learning_rate": 2.6356809562422254e-11, "loss": 1.9637, "num_input_tokens_seen": 618559264, "step": 607500 }, { "epoch": 11.962749305979406, "grad_norm": 1.901559829711914, "learning_rate": 2.3776986329104853e-11, "loss": 1.9714, "num_input_tokens_seen": 618661664, "step": 607600 }, { "epoch": 11.964718158728909, "grad_norm": 2.0196540355682373, "learning_rate": 2.133000262244389e-11, "loss": 1.9451, "num_input_tokens_seen": 618763152, "step": 607700 }, { "epoch": 11.966687011478411, "grad_norm": 2.054826498031616, "learning_rate": 1.901585909247494e-11, "loss": 2.0128, "num_input_tokens_seen": 618865552, "step": 607800 }, { "epoch": 11.968655864227914, "grad_norm": 1.9896193742752075, "learning_rate": 1.683455635403952e-11, "loss": 1.9903, "num_input_tokens_seen": 618967856, "step": 607900 }, { "epoch": 11.970624716977417, "grad_norm": 1.827958106994629, "learning_rate": 1.478609498678507e-11, "loss": 2.005, "num_input_tokens_seen": 619069488, "step": 608000 }, { "epoch": 11.97259356972692, "grad_norm": 1.9769021272659302, "learning_rate": 1.2870475534942915e-11, "loss": 1.9584, "num_input_tokens_seen": 619171888, "step": 608100 }, { "epoch": 11.974562422476422, "grad_norm": 2.0425643920898438, "learning_rate": 1.1087698507383781e-11, "loss": 1.9683, "num_input_tokens_seen": 619273856, "step": 608200 }, { "epoch": 11.976531275225925, "grad_norm": 1.987949013710022, "learning_rate": 9.43776437789534e-12, "loss": 1.9758, "num_input_tokens_seen": 619374808, "step": 608300 }, { "epoch": 11.978500127975428, "grad_norm": 1.9493474960327148, "learning_rate": 7.920673584793646e-12, "loss": 2.0262, "num_input_tokens_seen": 619476568, "step": 608400 }, { "epoch": 11.980468980724932, "grad_norm": 8.260712623596191, "learning_rate": 6.536426531145167e-12, "loss": 1.9921, "num_input_tokens_seen": 619578008, "step": 608500 }, { "epoch": 11.982437833474435, "grad_norm": 1.871995210647583, "learning_rate": 5.28502358476679e-12, "loss": 1.991, "num_input_tokens_seen": 619680408, "step": 608600 }, { "epoch": 11.984406686223938, "grad_norm": 1.9008432626724243, "learning_rate": 4.1664650781148e-12, "loss": 1.9481, "num_input_tokens_seen": 619782808, "step": 608700 }, { "epoch": 11.98637553897344, "grad_norm": 1.7159595489501953, "learning_rate": 3.1807513084514127e-12, "loss": 1.9444, "num_input_tokens_seen": 619885000, "step": 608800 }, { "epoch": 11.988344391722944, "grad_norm": 2.0267257690429688, "learning_rate": 2.3278825375672163e-12, "loss": 1.957, "num_input_tokens_seen": 619987400, "step": 608900 }, { "epoch": 11.990313244472446, "grad_norm": 2.0029587745666504, "learning_rate": 1.6078589921142417e-12, "loss": 2.0438, "num_input_tokens_seen": 620089800, "step": 609000 }, { "epoch": 11.992282097221949, "grad_norm": 1.907485842704773, "learning_rate": 1.0206808634394271e-12, "loss": 1.9305, "num_input_tokens_seen": 620192200, "step": 609100 }, { "epoch": 11.994250949971452, "grad_norm": 2.12469220161438, "learning_rate": 5.663483074735965e-13, "loss": 1.9854, "num_input_tokens_seen": 620294600, "step": 609200 }, { "epoch": 11.996219802720955, "grad_norm": 1.963167667388916, "learning_rate": 2.4486144495350357e-13, "loss": 1.9765, "num_input_tokens_seen": 620397000, "step": 609300 }, { "epoch": 11.998188655470457, "grad_norm": 1.826084852218628, "learning_rate": 5.622036136632147e-14, "loss": 1.9388, "num_input_tokens_seen": 620499400, "step": 609400 }, { "epoch": 12.0, "num_input_tokens_seen": 620593344, "step": 609492, "total_flos": 3.16711124803584e+17, "train_loss": 0.07982336108915004, "train_runtime": 2874.0318, "train_samples_per_second": 212.069, "train_steps_per_second": 212.069 } ], "logging_steps": 100, "max_steps": 609492, "num_input_tokens_seen": 620593344, "num_train_epochs": 12, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.16711124803584e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }