{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.991735537190083, "eval_steps": 500, "global_step": 2495, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020035061357375405, "grad_norm": 34.802622580516726, "learning_rate": 0.0, "loss": 2.5618, "num_tokens": 4194304.0, "step": 1 }, { "epoch": 0.004007012271475081, "grad_norm": 34.477558182780584, "learning_rate": 5.333333333333335e-07, "loss": 2.5574, "num_tokens": 8365923.0, "step": 2 }, { "epoch": 0.006010518407212622, "grad_norm": 35.191274246541376, "learning_rate": 1.066666666666667e-06, "loss": 2.5746, "num_tokens": 12488240.0, "step": 3 }, { "epoch": 0.008014024542950162, "grad_norm": 35.73089543316278, "learning_rate": 1.6000000000000001e-06, "loss": 2.5439, "num_tokens": 16682544.0, "step": 4 }, { "epoch": 0.010017530678687703, "grad_norm": 35.33049289971007, "learning_rate": 2.133333333333334e-06, "loss": 2.4818, "num_tokens": 20876848.0, "step": 5 }, { "epoch": 0.012021036814425245, "grad_norm": 34.85540028158663, "learning_rate": 2.666666666666667e-06, "loss": 2.3245, "num_tokens": 25071152.0, "step": 6 }, { "epoch": 0.014024542950162784, "grad_norm": 29.82332014595971, "learning_rate": 3.2000000000000003e-06, "loss": 1.9698, "num_tokens": 29239821.0, "step": 7 }, { "epoch": 0.016028049085900324, "grad_norm": 28.49703738519385, "learning_rate": 3.7333333333333337e-06, "loss": 1.937, "num_tokens": 33434125.0, "step": 8 }, { "epoch": 0.018031555221637866, "grad_norm": 25.192612922745788, "learning_rate": 4.266666666666668e-06, "loss": 1.8356, "num_tokens": 37618622.0, "step": 9 }, { "epoch": 0.020035061357375407, "grad_norm": 10.008528906079711, "learning_rate": 4.800000000000001e-06, "loss": 1.4497, "num_tokens": 41792622.0, "step": 10 }, { "epoch": 0.02203856749311295, "grad_norm": 8.556646993119584, "learning_rate": 5.333333333333334e-06, "loss": 1.3668, "num_tokens": 45986926.0, "step": 11 }, { "epoch": 0.02404207362885049, "grad_norm": 7.7780940012034945, "learning_rate": 5.8666666666666675e-06, "loss": 1.3467, "num_tokens": 50180418.0, "step": 12 }, { "epoch": 0.026045579764588028, "grad_norm": 39.5730504562258, "learning_rate": 6.4000000000000006e-06, "loss": 1.2271, "num_tokens": 54374722.0, "step": 13 }, { "epoch": 0.02804908590032557, "grad_norm": 12.64439488632276, "learning_rate": 6.9333333333333344e-06, "loss": 1.2069, "num_tokens": 58558587.0, "step": 14 }, { "epoch": 0.03005259203606311, "grad_norm": 4.168261331238969, "learning_rate": 7.4666666666666675e-06, "loss": 1.1509, "num_tokens": 62746166.0, "step": 15 }, { "epoch": 0.03205609817180065, "grad_norm": 3.3251246568702224, "learning_rate": 8.000000000000001e-06, "loss": 1.1087, "num_tokens": 66927291.0, "step": 16 }, { "epoch": 0.03405960430753819, "grad_norm": 2.4574877551516465, "learning_rate": 8.533333333333335e-06, "loss": 1.0511, "num_tokens": 71121595.0, "step": 17 }, { "epoch": 0.03606311044327573, "grad_norm": 1.8495441071855903, "learning_rate": 9.066666666666667e-06, "loss": 1.0167, "num_tokens": 75315899.0, "step": 18 }, { "epoch": 0.03806661657901327, "grad_norm": 1.5237545475155407, "learning_rate": 9.600000000000001e-06, "loss": 0.9816, "num_tokens": 79510203.0, "step": 19 }, { "epoch": 0.040070122714750814, "grad_norm": 1.4614636525035523, "learning_rate": 1.0133333333333335e-05, "loss": 0.9677, "num_tokens": 83688343.0, "step": 20 }, { "epoch": 0.042073628850488355, "grad_norm": 1.0416696407052841, "learning_rate": 1.0666666666666667e-05, "loss": 0.9205, "num_tokens": 87872724.0, "step": 21 }, { "epoch": 0.0440771349862259, "grad_norm": 0.9709016662910727, "learning_rate": 1.1200000000000001e-05, "loss": 0.9069, "num_tokens": 92053509.0, "step": 22 }, { "epoch": 0.04608064112196344, "grad_norm": 0.9243427486381531, "learning_rate": 1.1733333333333335e-05, "loss": 0.9032, "num_tokens": 96230102.0, "step": 23 }, { "epoch": 0.04808414725770098, "grad_norm": 0.6551293212181138, "learning_rate": 1.2266666666666667e-05, "loss": 0.8688, "num_tokens": 100419509.0, "step": 24 }, { "epoch": 0.05008765339343852, "grad_norm": 0.8032149625293368, "learning_rate": 1.2800000000000001e-05, "loss": 0.8358, "num_tokens": 104599822.0, "step": 25 }, { "epoch": 0.052091159529176055, "grad_norm": 0.7312977976468741, "learning_rate": 1.3333333333333333e-05, "loss": 0.8317, "num_tokens": 108770496.0, "step": 26 }, { "epoch": 0.054094665664913597, "grad_norm": 0.5503624992990909, "learning_rate": 1.3866666666666669e-05, "loss": 0.8097, "num_tokens": 112964800.0, "step": 27 }, { "epoch": 0.05609817180065114, "grad_norm": 0.5042961571074351, "learning_rate": 1.4400000000000001e-05, "loss": 0.818, "num_tokens": 117159104.0, "step": 28 }, { "epoch": 0.05810167793638868, "grad_norm": 0.6024241352735147, "learning_rate": 1.4933333333333335e-05, "loss": 0.8064, "num_tokens": 121285725.0, "step": 29 }, { "epoch": 0.06010518407212622, "grad_norm": 0.47170011035136544, "learning_rate": 1.546666666666667e-05, "loss": 0.7956, "num_tokens": 125455932.0, "step": 30 }, { "epoch": 0.06210869020786376, "grad_norm": 0.4657061005864622, "learning_rate": 1.6000000000000003e-05, "loss": 0.7899, "num_tokens": 129645742.0, "step": 31 }, { "epoch": 0.0641121963436013, "grad_norm": 0.4396529059771875, "learning_rate": 1.6533333333333333e-05, "loss": 0.7715, "num_tokens": 133840046.0, "step": 32 }, { "epoch": 0.06611570247933884, "grad_norm": 0.5126679822169967, "learning_rate": 1.706666666666667e-05, "loss": 0.777, "num_tokens": 138001668.0, "step": 33 }, { "epoch": 0.06811920861507638, "grad_norm": 0.36627325634146407, "learning_rate": 1.76e-05, "loss": 0.7528, "num_tokens": 142195972.0, "step": 34 }, { "epoch": 0.07012271475081393, "grad_norm": 0.5725558936997536, "learning_rate": 1.8133333333333335e-05, "loss": 0.768, "num_tokens": 146386929.0, "step": 35 }, { "epoch": 0.07212622088655146, "grad_norm": 0.4378284603244199, "learning_rate": 1.866666666666667e-05, "loss": 0.7708, "num_tokens": 150581233.0, "step": 36 }, { "epoch": 0.07412972702228901, "grad_norm": 0.5625784738595206, "learning_rate": 1.9200000000000003e-05, "loss": 0.7608, "num_tokens": 154775537.0, "step": 37 }, { "epoch": 0.07613323315802654, "grad_norm": 0.4402040801997641, "learning_rate": 1.9733333333333336e-05, "loss": 0.7505, "num_tokens": 158969841.0, "step": 38 }, { "epoch": 0.0781367392937641, "grad_norm": 0.49680165474973187, "learning_rate": 2.026666666666667e-05, "loss": 0.7537, "num_tokens": 163120308.0, "step": 39 }, { "epoch": 0.08014024542950163, "grad_norm": 0.5382469066903293, "learning_rate": 2.08e-05, "loss": 0.7501, "num_tokens": 167294610.0, "step": 40 }, { "epoch": 0.08214375156523916, "grad_norm": 0.4416600418522094, "learning_rate": 2.1333333333333335e-05, "loss": 0.7445, "num_tokens": 171474196.0, "step": 41 }, { "epoch": 0.08414725770097671, "grad_norm": 0.5299636170020358, "learning_rate": 2.186666666666667e-05, "loss": 0.7311, "num_tokens": 175668500.0, "step": 42 }, { "epoch": 0.08615076383671424, "grad_norm": 0.4941417628566962, "learning_rate": 2.2400000000000002e-05, "loss": 0.7257, "num_tokens": 179854962.0, "step": 43 }, { "epoch": 0.0881542699724518, "grad_norm": 0.3676819344505601, "learning_rate": 2.2933333333333336e-05, "loss": 0.7353, "num_tokens": 184049266.0, "step": 44 }, { "epoch": 0.09015777610818933, "grad_norm": 0.3318644540817694, "learning_rate": 2.346666666666667e-05, "loss": 0.7225, "num_tokens": 188243570.0, "step": 45 }, { "epoch": 0.09216128224392688, "grad_norm": 0.33085378953630096, "learning_rate": 2.4e-05, "loss": 0.7185, "num_tokens": 192403054.0, "step": 46 }, { "epoch": 0.09416478837966441, "grad_norm": 0.35156027225127384, "learning_rate": 2.4533333333333334e-05, "loss": 0.6978, "num_tokens": 196591912.0, "step": 47 }, { "epoch": 0.09616829451540196, "grad_norm": 0.3296249851514653, "learning_rate": 2.5066666666666672e-05, "loss": 0.7209, "num_tokens": 200712370.0, "step": 48 }, { "epoch": 0.09817180065113949, "grad_norm": 0.36570436629913133, "learning_rate": 2.5600000000000002e-05, "loss": 0.7212, "num_tokens": 204885796.0, "step": 49 }, { "epoch": 0.10017530678687704, "grad_norm": 0.45643939988748944, "learning_rate": 2.6133333333333336e-05, "loss": 0.7139, "num_tokens": 209080100.0, "step": 50 }, { "epoch": 0.10217881292261458, "grad_norm": 0.3062895499643401, "learning_rate": 2.6666666666666667e-05, "loss": 0.7046, "num_tokens": 213274404.0, "step": 51 }, { "epoch": 0.10418231905835211, "grad_norm": 0.5006549946338266, "learning_rate": 2.7200000000000004e-05, "loss": 0.7131, "num_tokens": 217464633.0, "step": 52 }, { "epoch": 0.10618582519408966, "grad_norm": 0.3386937392761925, "learning_rate": 2.7733333333333338e-05, "loss": 0.7066, "num_tokens": 221658937.0, "step": 53 }, { "epoch": 0.10818933132982719, "grad_norm": 0.594697116041374, "learning_rate": 2.8266666666666668e-05, "loss": 0.6887, "num_tokens": 225782679.0, "step": 54 }, { "epoch": 0.11019283746556474, "grad_norm": 0.47344213129954177, "learning_rate": 2.8800000000000002e-05, "loss": 0.7072, "num_tokens": 229968653.0, "step": 55 }, { "epoch": 0.11219634360130228, "grad_norm": 0.6638141335065528, "learning_rate": 2.9333333333333333e-05, "loss": 0.6973, "num_tokens": 234110082.0, "step": 56 }, { "epoch": 0.11419984973703982, "grad_norm": 0.6004620926307082, "learning_rate": 2.986666666666667e-05, "loss": 0.6763, "num_tokens": 238304386.0, "step": 57 }, { "epoch": 0.11620335587277736, "grad_norm": 0.44153036997030487, "learning_rate": 3.0400000000000004e-05, "loss": 0.7042, "num_tokens": 242472776.0, "step": 58 }, { "epoch": 0.1182068620085149, "grad_norm": 0.6204788706605988, "learning_rate": 3.093333333333334e-05, "loss": 0.6845, "num_tokens": 246658178.0, "step": 59 }, { "epoch": 0.12021036814425244, "grad_norm": 0.3734751627828484, "learning_rate": 3.146666666666667e-05, "loss": 0.6961, "num_tokens": 250821747.0, "step": 60 }, { "epoch": 0.12221387427998998, "grad_norm": 0.688392541057556, "learning_rate": 3.2000000000000005e-05, "loss": 0.6975, "num_tokens": 254966155.0, "step": 61 }, { "epoch": 0.12421738041572752, "grad_norm": 0.6325685908606387, "learning_rate": 3.2533333333333336e-05, "loss": 0.6774, "num_tokens": 259091961.0, "step": 62 }, { "epoch": 0.12622088655146507, "grad_norm": 0.5966035925864989, "learning_rate": 3.3066666666666666e-05, "loss": 0.6991, "num_tokens": 263267094.0, "step": 63 }, { "epoch": 0.1282243926872026, "grad_norm": 0.5793726726003035, "learning_rate": 3.3600000000000004e-05, "loss": 0.6721, "num_tokens": 267439775.0, "step": 64 }, { "epoch": 0.13022789882294014, "grad_norm": 0.4997295391234862, "learning_rate": 3.413333333333334e-05, "loss": 0.6743, "num_tokens": 271634079.0, "step": 65 }, { "epoch": 0.1322314049586777, "grad_norm": 0.5565525456120014, "learning_rate": 3.466666666666667e-05, "loss": 0.7045, "num_tokens": 275817340.0, "step": 66 }, { "epoch": 0.13423491109441524, "grad_norm": 0.49022372595764185, "learning_rate": 3.52e-05, "loss": 0.6955, "num_tokens": 280011644.0, "step": 67 }, { "epoch": 0.13623841723015276, "grad_norm": 0.7464894939014075, "learning_rate": 3.573333333333333e-05, "loss": 0.6864, "num_tokens": 284205948.0, "step": 68 }, { "epoch": 0.1382419233658903, "grad_norm": 0.6277071610563707, "learning_rate": 3.626666666666667e-05, "loss": 0.6774, "num_tokens": 288400252.0, "step": 69 }, { "epoch": 0.14024542950162786, "grad_norm": 0.7234462962650683, "learning_rate": 3.680000000000001e-05, "loss": 0.6785, "num_tokens": 292565996.0, "step": 70 }, { "epoch": 0.14224893563736538, "grad_norm": 0.536547788319989, "learning_rate": 3.733333333333334e-05, "loss": 0.6793, "num_tokens": 296760300.0, "step": 71 }, { "epoch": 0.14425244177310292, "grad_norm": 0.7078247847148855, "learning_rate": 3.786666666666667e-05, "loss": 0.6747, "num_tokens": 300929996.0, "step": 72 }, { "epoch": 0.14625594790884047, "grad_norm": 0.5029066996835548, "learning_rate": 3.8400000000000005e-05, "loss": 0.6947, "num_tokens": 305068677.0, "step": 73 }, { "epoch": 0.14825945404457802, "grad_norm": 0.6755561449019495, "learning_rate": 3.8933333333333336e-05, "loss": 0.6824, "num_tokens": 309262981.0, "step": 74 }, { "epoch": 0.15026296018031554, "grad_norm": 0.5507016726035784, "learning_rate": 3.946666666666667e-05, "loss": 0.6862, "num_tokens": 313457285.0, "step": 75 }, { "epoch": 0.1522664663160531, "grad_norm": 0.5963927618675963, "learning_rate": 4e-05, "loss": 0.6804, "num_tokens": 317638533.0, "step": 76 }, { "epoch": 0.15426997245179064, "grad_norm": 0.6424366422807498, "learning_rate": 3.999998483258877e-05, "loss": 0.6654, "num_tokens": 321810983.0, "step": 77 }, { "epoch": 0.1562734785875282, "grad_norm": 0.5207952547644799, "learning_rate": 3.999993933038064e-05, "loss": 0.6726, "num_tokens": 326005287.0, "step": 78 }, { "epoch": 0.1582769847232657, "grad_norm": 0.5319441513564792, "learning_rate": 3.9999863493452286e-05, "loss": 0.6649, "num_tokens": 330199591.0, "step": 79 }, { "epoch": 0.16028049085900326, "grad_norm": 0.38028224450615095, "learning_rate": 3.999975732193152e-05, "loss": 0.6722, "num_tokens": 334388425.0, "step": 80 }, { "epoch": 0.1622839969947408, "grad_norm": 0.9664154313834891, "learning_rate": 3.999962081599728e-05, "loss": 0.6726, "num_tokens": 338552965.0, "step": 81 }, { "epoch": 0.16428750313047832, "grad_norm": 0.6238267815231743, "learning_rate": 3.999945397587959e-05, "loss": 0.6757, "num_tokens": 342726730.0, "step": 82 }, { "epoch": 0.16629100926621587, "grad_norm": 0.9444803094075671, "learning_rate": 3.999925680185964e-05, "loss": 0.6822, "num_tokens": 346886500.0, "step": 83 }, { "epoch": 0.16829451540195342, "grad_norm": 0.7172909062281609, "learning_rate": 3.9999029294269716e-05, "loss": 0.6687, "num_tokens": 351062133.0, "step": 84 }, { "epoch": 0.17029802153769097, "grad_norm": 0.9729021515740269, "learning_rate": 3.999877145349323e-05, "loss": 0.692, "num_tokens": 355256437.0, "step": 85 }, { "epoch": 0.1723015276734285, "grad_norm": 0.8519276508837444, "learning_rate": 3.999848327996471e-05, "loss": 0.6528, "num_tokens": 359437666.0, "step": 86 }, { "epoch": 0.17430503380916604, "grad_norm": 0.7015378137866779, "learning_rate": 3.99981647741698e-05, "loss": 0.6733, "num_tokens": 363631970.0, "step": 87 }, { "epoch": 0.1763085399449036, "grad_norm": 0.8030151165304644, "learning_rate": 3.9997815936645285e-05, "loss": 0.6673, "num_tokens": 367826274.0, "step": 88 }, { "epoch": 0.17831204608064113, "grad_norm": 0.6964760376939347, "learning_rate": 3.999743676797903e-05, "loss": 0.6619, "num_tokens": 372020578.0, "step": 89 }, { "epoch": 0.18031555221637866, "grad_norm": 0.729788156726713, "learning_rate": 3.999702726881005e-05, "loss": 0.6631, "num_tokens": 376194351.0, "step": 90 }, { "epoch": 0.1823190583521162, "grad_norm": 0.7060369400278887, "learning_rate": 3.999658743982846e-05, "loss": 0.6674, "num_tokens": 380370256.0, "step": 91 }, { "epoch": 0.18432256448785375, "grad_norm": 0.6745929131907502, "learning_rate": 3.999611728177548e-05, "loss": 0.6761, "num_tokens": 384534269.0, "step": 92 }, { "epoch": 0.18632607062359127, "grad_norm": 0.6443255886834249, "learning_rate": 3.999561679544346e-05, "loss": 0.6631, "num_tokens": 388682932.0, "step": 93 }, { "epoch": 0.18832957675932882, "grad_norm": 0.5883783348158106, "learning_rate": 3.9995085981675846e-05, "loss": 0.667, "num_tokens": 392877236.0, "step": 94 }, { "epoch": 0.19033308289506637, "grad_norm": 0.5765310897205485, "learning_rate": 3.999452484136722e-05, "loss": 0.6544, "num_tokens": 397066877.0, "step": 95 }, { "epoch": 0.19233658903080392, "grad_norm": 0.4773718950654361, "learning_rate": 3.9993933375463224e-05, "loss": 0.6623, "num_tokens": 401234310.0, "step": 96 }, { "epoch": 0.19434009516654144, "grad_norm": 0.5248800386897873, "learning_rate": 3.999331158496066e-05, "loss": 0.6589, "num_tokens": 405398894.0, "step": 97 }, { "epoch": 0.19634360130227899, "grad_norm": 0.42662586210806297, "learning_rate": 3.999265947090741e-05, "loss": 0.6624, "num_tokens": 409584139.0, "step": 98 }, { "epoch": 0.19834710743801653, "grad_norm": 0.5912435267621513, "learning_rate": 3.999197703440245e-05, "loss": 0.6494, "num_tokens": 413778443.0, "step": 99 }, { "epoch": 0.20035061357375408, "grad_norm": 0.46051782601003877, "learning_rate": 3.999126427659588e-05, "loss": 0.659, "num_tokens": 417948035.0, "step": 100 }, { "epoch": 0.2023541197094916, "grad_norm": 0.4966663684560842, "learning_rate": 3.999052119868887e-05, "loss": 0.6567, "num_tokens": 422127334.0, "step": 101 }, { "epoch": 0.20435762584522915, "grad_norm": 0.5257731805789438, "learning_rate": 3.9989747801933724e-05, "loss": 0.6583, "num_tokens": 426321638.0, "step": 102 }, { "epoch": 0.2063611319809667, "grad_norm": 0.47738215828229424, "learning_rate": 3.998894408763382e-05, "loss": 0.6646, "num_tokens": 430477388.0, "step": 103 }, { "epoch": 0.20836463811670422, "grad_norm": 0.4379451851616906, "learning_rate": 3.998811005714362e-05, "loss": 0.6476, "num_tokens": 434612031.0, "step": 104 }, { "epoch": 0.21036814425244177, "grad_norm": 0.41355052386566354, "learning_rate": 3.99872457118687e-05, "loss": 0.6753, "num_tokens": 438806335.0, "step": 105 }, { "epoch": 0.21237165038817932, "grad_norm": 0.42506318625963985, "learning_rate": 3.998635105326571e-05, "loss": 0.658, "num_tokens": 442975668.0, "step": 106 }, { "epoch": 0.21437515652391687, "grad_norm": 0.3747156602764816, "learning_rate": 3.9985426082842387e-05, "loss": 0.6537, "num_tokens": 447169972.0, "step": 107 }, { "epoch": 0.21637866265965439, "grad_norm": 0.42330783728407684, "learning_rate": 3.9984470802157556e-05, "loss": 0.6577, "num_tokens": 451364276.0, "step": 108 }, { "epoch": 0.21838216879539193, "grad_norm": 0.30391857540658146, "learning_rate": 3.998348521282113e-05, "loss": 0.6581, "num_tokens": 455558580.0, "step": 109 }, { "epoch": 0.22038567493112948, "grad_norm": 0.5183109496802537, "learning_rate": 3.998246931649407e-05, "loss": 0.6604, "num_tokens": 459723912.0, "step": 110 }, { "epoch": 0.22238918106686703, "grad_norm": 0.4149585950459982, "learning_rate": 3.998142311488845e-05, "loss": 0.6536, "num_tokens": 463906994.0, "step": 111 }, { "epoch": 0.22439268720260455, "grad_norm": 0.4531326937842188, "learning_rate": 3.99803466097674e-05, "loss": 0.6613, "num_tokens": 468076295.0, "step": 112 }, { "epoch": 0.2263961933383421, "grad_norm": 0.4710774124531649, "learning_rate": 3.997923980294511e-05, "loss": 0.6545, "num_tokens": 472270599.0, "step": 113 }, { "epoch": 0.22839969947407965, "grad_norm": 1.0443380737393089, "learning_rate": 3.997810269628686e-05, "loss": 0.655, "num_tokens": 476464903.0, "step": 114 }, { "epoch": 0.23040320560981717, "grad_norm": 0.5453124787456369, "learning_rate": 3.997693529170896e-05, "loss": 0.6709, "num_tokens": 480636497.0, "step": 115 }, { "epoch": 0.23240671174555472, "grad_norm": 0.28969727739667056, "learning_rate": 3.997573759117882e-05, "loss": 0.6525, "num_tokens": 484830801.0, "step": 116 }, { "epoch": 0.23441021788129227, "grad_norm": 0.5128340209938913, "learning_rate": 3.997450959671487e-05, "loss": 0.6625, "num_tokens": 489025105.0, "step": 117 }, { "epoch": 0.2364137240170298, "grad_norm": 0.36332536595618165, "learning_rate": 3.9973251310386616e-05, "loss": 0.6647, "num_tokens": 493210718.0, "step": 118 }, { "epoch": 0.23841723015276733, "grad_norm": 0.4513185046868507, "learning_rate": 3.997196273431461e-05, "loss": 0.6499, "num_tokens": 497387117.0, "step": 119 }, { "epoch": 0.24042073628850488, "grad_norm": 0.4199789536650745, "learning_rate": 3.997064387067044e-05, "loss": 0.6448, "num_tokens": 501581421.0, "step": 120 }, { "epoch": 0.24242424242424243, "grad_norm": 0.29453377282293347, "learning_rate": 3.996929472167676e-05, "loss": 0.6539, "num_tokens": 505775725.0, "step": 121 }, { "epoch": 0.24442774855997995, "grad_norm": 0.46174459757284936, "learning_rate": 3.9967915289607234e-05, "loss": 0.6512, "num_tokens": 509942189.0, "step": 122 }, { "epoch": 0.2464312546957175, "grad_norm": 0.5283354259508714, "learning_rate": 3.9966505576786575e-05, "loss": 0.6499, "num_tokens": 514105295.0, "step": 123 }, { "epoch": 0.24843476083145505, "grad_norm": 0.3677268219462525, "learning_rate": 3.9965065585590535e-05, "loss": 0.6549, "num_tokens": 518299599.0, "step": 124 }, { "epoch": 0.25043826696719257, "grad_norm": 0.5743699089851091, "learning_rate": 3.9963595318445876e-05, "loss": 0.6513, "num_tokens": 522493903.0, "step": 125 }, { "epoch": 0.25244177310293014, "grad_norm": 0.3280812566170468, "learning_rate": 3.9962094777830394e-05, "loss": 0.6441, "num_tokens": 526688207.0, "step": 126 }, { "epoch": 0.25444527923866767, "grad_norm": 0.6971759771917562, "learning_rate": 3.996056396627291e-05, "loss": 0.642, "num_tokens": 530877438.0, "step": 127 }, { "epoch": 0.2564487853744052, "grad_norm": 0.5248625974721883, "learning_rate": 3.995900288635324e-05, "loss": 0.6657, "num_tokens": 535027987.0, "step": 128 }, { "epoch": 0.25845229151014276, "grad_norm": 0.6750236226923346, "learning_rate": 3.9957411540702236e-05, "loss": 0.6434, "num_tokens": 539208832.0, "step": 129 }, { "epoch": 0.2604557976458803, "grad_norm": 0.5005511752350783, "learning_rate": 3.995578993200173e-05, "loss": 0.6541, "num_tokens": 543376378.0, "step": 130 }, { "epoch": 0.26245930378161786, "grad_norm": 0.6408582812821989, "learning_rate": 3.9954138062984565e-05, "loss": 0.6512, "num_tokens": 547570682.0, "step": 131 }, { "epoch": 0.2644628099173554, "grad_norm": 0.5782229312353571, "learning_rate": 3.9952455936434596e-05, "loss": 0.6511, "num_tokens": 551764986.0, "step": 132 }, { "epoch": 0.2664663160530929, "grad_norm": 0.5645731919212355, "learning_rate": 3.995074355518664e-05, "loss": 0.644, "num_tokens": 555959290.0, "step": 133 }, { "epoch": 0.2684698221888305, "grad_norm": 0.4310466445934037, "learning_rate": 3.994900092212654e-05, "loss": 0.6574, "num_tokens": 560153594.0, "step": 134 }, { "epoch": 0.270473328324568, "grad_norm": 0.4856208551340596, "learning_rate": 3.9947228040191076e-05, "loss": 0.6533, "num_tokens": 564347898.0, "step": 135 }, { "epoch": 0.2724768344603055, "grad_norm": 0.4369990908033775, "learning_rate": 3.9945424912368045e-05, "loss": 0.6617, "num_tokens": 568542202.0, "step": 136 }, { "epoch": 0.2744803405960431, "grad_norm": 0.4103547855269695, "learning_rate": 3.9943591541696195e-05, "loss": 0.6311, "num_tokens": 572715701.0, "step": 137 }, { "epoch": 0.2764838467317806, "grad_norm": 0.4513731865739979, "learning_rate": 3.994172793126524e-05, "loss": 0.6475, "num_tokens": 576886580.0, "step": 138 }, { "epoch": 0.27848735286751813, "grad_norm": 0.3441980960460298, "learning_rate": 3.993983408421588e-05, "loss": 0.6582, "num_tokens": 581080884.0, "step": 139 }, { "epoch": 0.2804908590032557, "grad_norm": 0.5836774358964197, "learning_rate": 3.993791000373974e-05, "loss": 0.6351, "num_tokens": 585275188.0, "step": 140 }, { "epoch": 0.28249436513899323, "grad_norm": 0.43875439673313993, "learning_rate": 3.993595569307942e-05, "loss": 0.6344, "num_tokens": 589442266.0, "step": 141 }, { "epoch": 0.28449787127473075, "grad_norm": 0.548769503170393, "learning_rate": 3.993397115552845e-05, "loss": 0.6472, "num_tokens": 593636570.0, "step": 142 }, { "epoch": 0.2865013774104683, "grad_norm": 0.5565889407083611, "learning_rate": 3.993195639443131e-05, "loss": 0.6503, "num_tokens": 597816936.0, "step": 143 }, { "epoch": 0.28850488354620585, "grad_norm": 0.3828409931326108, "learning_rate": 3.992991141318341e-05, "loss": 0.6461, "num_tokens": 601964276.0, "step": 144 }, { "epoch": 0.2905083896819434, "grad_norm": 0.48232140382949695, "learning_rate": 3.99278362152311e-05, "loss": 0.6435, "num_tokens": 606158580.0, "step": 145 }, { "epoch": 0.29251189581768094, "grad_norm": 0.46393450789957874, "learning_rate": 3.992573080407163e-05, "loss": 0.6425, "num_tokens": 610352884.0, "step": 146 }, { "epoch": 0.29451540195341847, "grad_norm": 0.4291349180874596, "learning_rate": 3.99235951832532e-05, "loss": 0.6391, "num_tokens": 614521548.0, "step": 147 }, { "epoch": 0.29651890808915604, "grad_norm": 0.43556312839610645, "learning_rate": 3.992142935637489e-05, "loss": 0.6551, "num_tokens": 618692882.0, "step": 148 }, { "epoch": 0.29852241422489356, "grad_norm": 0.41740472694700836, "learning_rate": 3.9919233327086705e-05, "loss": 0.6393, "num_tokens": 622835094.0, "step": 149 }, { "epoch": 0.3005259203606311, "grad_norm": 0.41001929530366726, "learning_rate": 3.991700709908953e-05, "loss": 0.6404, "num_tokens": 626997459.0, "step": 150 }, { "epoch": 0.30252942649636866, "grad_norm": 0.42208615886787587, "learning_rate": 3.991475067613517e-05, "loss": 0.6492, "num_tokens": 631191763.0, "step": 151 }, { "epoch": 0.3045329326321062, "grad_norm": 0.3193907267378484, "learning_rate": 3.99124640620263e-05, "loss": 0.6654, "num_tokens": 635375436.0, "step": 152 }, { "epoch": 0.3065364387678437, "grad_norm": 0.352661600838445, "learning_rate": 3.991014726061648e-05, "loss": 0.632, "num_tokens": 639529184.0, "step": 153 }, { "epoch": 0.3085399449035813, "grad_norm": 0.37513041759164856, "learning_rate": 3.990780027581013e-05, "loss": 0.6356, "num_tokens": 643714363.0, "step": 154 }, { "epoch": 0.3105434510393188, "grad_norm": 0.33964645208504574, "learning_rate": 3.990542311156255e-05, "loss": 0.6466, "num_tokens": 647881176.0, "step": 155 }, { "epoch": 0.3125469571750564, "grad_norm": 0.3199359347599082, "learning_rate": 3.990301577187991e-05, "loss": 0.6337, "num_tokens": 652075480.0, "step": 156 }, { "epoch": 0.3145504633107939, "grad_norm": 0.3657883355314684, "learning_rate": 3.990057826081921e-05, "loss": 0.6383, "num_tokens": 656269784.0, "step": 157 }, { "epoch": 0.3165539694465314, "grad_norm": 0.39152313013246826, "learning_rate": 3.989811058248832e-05, "loss": 0.6556, "num_tokens": 660464088.0, "step": 158 }, { "epoch": 0.318557475582269, "grad_norm": 0.28764428777484086, "learning_rate": 3.989561274104593e-05, "loss": 0.6392, "num_tokens": 664658392.0, "step": 159 }, { "epoch": 0.3205609817180065, "grad_norm": 0.40651978567764446, "learning_rate": 3.9893084740701576e-05, "loss": 0.6201, "num_tokens": 668852696.0, "step": 160 }, { "epoch": 0.32256448785374403, "grad_norm": 0.4833284044043898, "learning_rate": 3.989052658571561e-05, "loss": 0.6269, "num_tokens": 673047000.0, "step": 161 }, { "epoch": 0.3245679939894816, "grad_norm": 0.25173103843593436, "learning_rate": 3.9887938280399224e-05, "loss": 0.643, "num_tokens": 677241304.0, "step": 162 }, { "epoch": 0.3265715001252191, "grad_norm": 0.5583295917190269, "learning_rate": 3.9885319829114386e-05, "loss": 0.6348, "num_tokens": 681435608.0, "step": 163 }, { "epoch": 0.32857500626095665, "grad_norm": 0.35770769766297134, "learning_rate": 3.98826712362739e-05, "loss": 0.6233, "num_tokens": 685629912.0, "step": 164 }, { "epoch": 0.3305785123966942, "grad_norm": 0.43973807991808767, "learning_rate": 3.9879992506341363e-05, "loss": 0.6436, "num_tokens": 689818466.0, "step": 165 }, { "epoch": 0.33258201853243174, "grad_norm": 0.4574167495061099, "learning_rate": 3.987728364383113e-05, "loss": 0.6429, "num_tokens": 693983514.0, "step": 166 }, { "epoch": 0.3345855246681693, "grad_norm": 0.4331572996013807, "learning_rate": 3.987454465330838e-05, "loss": 0.6274, "num_tokens": 698177818.0, "step": 167 }, { "epoch": 0.33658903080390684, "grad_norm": 0.3316964381020293, "learning_rate": 3.9871775539389034e-05, "loss": 0.6399, "num_tokens": 702349929.0, "step": 168 }, { "epoch": 0.33859253693964436, "grad_norm": 0.4484742743633417, "learning_rate": 3.98689763067398e-05, "loss": 0.6289, "num_tokens": 706544233.0, "step": 169 }, { "epoch": 0.34059604307538194, "grad_norm": 0.34857965903413757, "learning_rate": 3.986614696007812e-05, "loss": 0.6495, "num_tokens": 710738537.0, "step": 170 }, { "epoch": 0.34259954921111946, "grad_norm": 0.28175560570000463, "learning_rate": 3.986328750417222e-05, "loss": 0.6454, "num_tokens": 714900821.0, "step": 171 }, { "epoch": 0.344603055346857, "grad_norm": 0.41168093618340845, "learning_rate": 3.986039794384103e-05, "loss": 0.6365, "num_tokens": 719084364.0, "step": 172 }, { "epoch": 0.34660656148259456, "grad_norm": 0.3259136290359811, "learning_rate": 3.985747828395425e-05, "loss": 0.6304, "num_tokens": 723278668.0, "step": 173 }, { "epoch": 0.3486100676183321, "grad_norm": 0.4026582645075551, "learning_rate": 3.9854528529432285e-05, "loss": 0.641, "num_tokens": 727472972.0, "step": 174 }, { "epoch": 0.3506135737540696, "grad_norm": 0.35002465027714663, "learning_rate": 3.9851548685246253e-05, "loss": 0.6332, "num_tokens": 731665491.0, "step": 175 }, { "epoch": 0.3526170798898072, "grad_norm": 0.3574793715617089, "learning_rate": 3.984853875641799e-05, "loss": 0.6414, "num_tokens": 735859795.0, "step": 176 }, { "epoch": 0.3546205860255447, "grad_norm": 0.4011584398082733, "learning_rate": 3.984549874802005e-05, "loss": 0.6458, "num_tokens": 740026273.0, "step": 177 }, { "epoch": 0.35662409216128227, "grad_norm": 0.28874631652066873, "learning_rate": 3.984242866517565e-05, "loss": 0.6385, "num_tokens": 744219711.0, "step": 178 }, { "epoch": 0.3586275982970198, "grad_norm": 0.39832493813878633, "learning_rate": 3.9839328513058705e-05, "loss": 0.6476, "num_tokens": 748414015.0, "step": 179 }, { "epoch": 0.3606311044327573, "grad_norm": 0.6571497269217429, "learning_rate": 3.983619829689379e-05, "loss": 0.6348, "num_tokens": 752608319.0, "step": 180 }, { "epoch": 0.3626346105684949, "grad_norm": 0.4539133792366094, "learning_rate": 3.983303802195617e-05, "loss": 0.6318, "num_tokens": 756770027.0, "step": 181 }, { "epoch": 0.3646381167042324, "grad_norm": 0.8663875517866773, "learning_rate": 3.982984769357175e-05, "loss": 0.6358, "num_tokens": 760944624.0, "step": 182 }, { "epoch": 0.3666416228399699, "grad_norm": 0.9033175741982602, "learning_rate": 3.982662731711709e-05, "loss": 0.6224, "num_tokens": 765138928.0, "step": 183 }, { "epoch": 0.3686451289757075, "grad_norm": 0.28431899918341297, "learning_rate": 3.9823376898019394e-05, "loss": 0.6238, "num_tokens": 769333232.0, "step": 184 }, { "epoch": 0.370648635111445, "grad_norm": 0.5961690872033016, "learning_rate": 3.982009644175647e-05, "loss": 0.6282, "num_tokens": 773527536.0, "step": 185 }, { "epoch": 0.37265214124718254, "grad_norm": 0.5030299089973187, "learning_rate": 3.981678595385679e-05, "loss": 0.6333, "num_tokens": 777721840.0, "step": 186 }, { "epoch": 0.3746556473829201, "grad_norm": 0.38613684617198757, "learning_rate": 3.98134454398994e-05, "loss": 0.6329, "num_tokens": 781916144.0, "step": 187 }, { "epoch": 0.37665915351865764, "grad_norm": 0.35270678794472654, "learning_rate": 3.981007490551396e-05, "loss": 0.627, "num_tokens": 786107586.0, "step": 188 }, { "epoch": 0.3786626596543952, "grad_norm": 0.4061001534021191, "learning_rate": 3.980667435638072e-05, "loss": 0.6225, "num_tokens": 790295195.0, "step": 189 }, { "epoch": 0.38066616579013274, "grad_norm": 0.2999062851277684, "learning_rate": 3.9803243798230524e-05, "loss": 0.6405, "num_tokens": 794489499.0, "step": 190 }, { "epoch": 0.38266967192587026, "grad_norm": 0.4937978547314352, "learning_rate": 3.9799783236844776e-05, "loss": 0.6209, "num_tokens": 798639712.0, "step": 191 }, { "epoch": 0.38467317806160783, "grad_norm": 0.3815106294937793, "learning_rate": 3.979629267805546e-05, "loss": 0.6229, "num_tokens": 802834016.0, "step": 192 }, { "epoch": 0.38667668419734536, "grad_norm": 0.4554544100097785, "learning_rate": 3.979277212774509e-05, "loss": 0.6344, "num_tokens": 807022622.0, "step": 193 }, { "epoch": 0.3886801903330829, "grad_norm": 0.41652930567153046, "learning_rate": 3.978922159184675e-05, "loss": 0.6268, "num_tokens": 811216926.0, "step": 194 }, { "epoch": 0.39068369646882045, "grad_norm": 0.3583019351750032, "learning_rate": 3.978564107634402e-05, "loss": 0.6242, "num_tokens": 815411230.0, "step": 195 }, { "epoch": 0.39268720260455797, "grad_norm": 0.4189500317221142, "learning_rate": 3.978203058727106e-05, "loss": 0.616, "num_tokens": 819578591.0, "step": 196 }, { "epoch": 0.3946907087402955, "grad_norm": 0.3115118107693864, "learning_rate": 3.977839013071248e-05, "loss": 0.6236, "num_tokens": 823766213.0, "step": 197 }, { "epoch": 0.39669421487603307, "grad_norm": 0.4256235117125459, "learning_rate": 3.9774719712803456e-05, "loss": 0.6311, "num_tokens": 827960517.0, "step": 198 }, { "epoch": 0.3986977210117706, "grad_norm": 0.33466090265646053, "learning_rate": 3.977101933972959e-05, "loss": 0.6308, "num_tokens": 832154821.0, "step": 199 }, { "epoch": 0.40070122714750817, "grad_norm": 0.42860442412338035, "learning_rate": 3.9767289017727035e-05, "loss": 0.6453, "num_tokens": 836349125.0, "step": 200 }, { "epoch": 0.4027047332832457, "grad_norm": 0.3646223392162348, "learning_rate": 3.9763528753082366e-05, "loss": 0.6189, "num_tokens": 840519421.0, "step": 201 }, { "epoch": 0.4047082394189832, "grad_norm": 0.33130201439790763, "learning_rate": 3.9759738552132635e-05, "loss": 0.626, "num_tokens": 844653752.0, "step": 202 }, { "epoch": 0.4067117455547208, "grad_norm": 0.29968051961505804, "learning_rate": 3.975591842126536e-05, "loss": 0.6295, "num_tokens": 848848056.0, "step": 203 }, { "epoch": 0.4087152516904583, "grad_norm": 0.26824538161214395, "learning_rate": 3.9752068366918465e-05, "loss": 0.6293, "num_tokens": 853042360.0, "step": 204 }, { "epoch": 0.4107187578261958, "grad_norm": 0.32300821636893273, "learning_rate": 3.974818839558034e-05, "loss": 0.6043, "num_tokens": 857236664.0, "step": 205 }, { "epoch": 0.4127222639619334, "grad_norm": 0.26114910187911466, "learning_rate": 3.974427851378977e-05, "loss": 0.6343, "num_tokens": 861417450.0, "step": 206 }, { "epoch": 0.4147257700976709, "grad_norm": 0.4282383469985268, "learning_rate": 3.974033872813595e-05, "loss": 0.6299, "num_tokens": 865611754.0, "step": 207 }, { "epoch": 0.41672927623340844, "grad_norm": 0.3034645299158751, "learning_rate": 3.973636904525848e-05, "loss": 0.6259, "num_tokens": 869792337.0, "step": 208 }, { "epoch": 0.418732782369146, "grad_norm": 0.3253308042963881, "learning_rate": 3.9732369471847334e-05, "loss": 0.6312, "num_tokens": 873986641.0, "step": 209 }, { "epoch": 0.42073628850488354, "grad_norm": 0.3424406928629939, "learning_rate": 3.972834001464287e-05, "loss": 0.6098, "num_tokens": 878162370.0, "step": 210 }, { "epoch": 0.4227397946406211, "grad_norm": 0.29593594479049334, "learning_rate": 3.97242806804358e-05, "loss": 0.6319, "num_tokens": 882356674.0, "step": 211 }, { "epoch": 0.42474330077635863, "grad_norm": 0.6025326098803494, "learning_rate": 3.9720191476067185e-05, "loss": 0.631, "num_tokens": 886544149.0, "step": 212 }, { "epoch": 0.42674680691209615, "grad_norm": 0.35804911855165494, "learning_rate": 3.9716072408428446e-05, "loss": 0.6236, "num_tokens": 890738453.0, "step": 213 }, { "epoch": 0.42875031304783373, "grad_norm": 0.7312832855816828, "learning_rate": 3.971192348446129e-05, "loss": 0.6265, "num_tokens": 894932757.0, "step": 214 }, { "epoch": 0.43075381918357125, "grad_norm": 0.7346689279964198, "learning_rate": 3.970774471115779e-05, "loss": 0.6308, "num_tokens": 899094537.0, "step": 215 }, { "epoch": 0.43275732531930877, "grad_norm": 0.37428244133208866, "learning_rate": 3.970353609556029e-05, "loss": 0.6227, "num_tokens": 903266310.0, "step": 216 }, { "epoch": 0.43476083145504635, "grad_norm": 0.4051643550665667, "learning_rate": 3.9699297644761425e-05, "loss": 0.6207, "num_tokens": 907460614.0, "step": 217 }, { "epoch": 0.43676433759078387, "grad_norm": 0.3705692162609298, "learning_rate": 3.9695029365904136e-05, "loss": 0.6213, "num_tokens": 911654918.0, "step": 218 }, { "epoch": 0.4387678437265214, "grad_norm": 0.3800008646935558, "learning_rate": 3.96907312661816e-05, "loss": 0.6311, "num_tokens": 915848914.0, "step": 219 }, { "epoch": 0.44077134986225897, "grad_norm": 0.3804079496755877, "learning_rate": 3.968640335283728e-05, "loss": 0.6165, "num_tokens": 920021818.0, "step": 220 }, { "epoch": 0.4427748559979965, "grad_norm": 0.43968077280680495, "learning_rate": 3.9682045633164874e-05, "loss": 0.6327, "num_tokens": 924202707.0, "step": 221 }, { "epoch": 0.44477836213373406, "grad_norm": 0.3760181035216824, "learning_rate": 3.967765811450828e-05, "loss": 0.6277, "num_tokens": 928358524.0, "step": 222 }, { "epoch": 0.4467818682694716, "grad_norm": 0.4648644170632838, "learning_rate": 3.967324080426168e-05, "loss": 0.6228, "num_tokens": 932552828.0, "step": 223 }, { "epoch": 0.4487853744052091, "grad_norm": 0.43584069285630905, "learning_rate": 3.9668793709869395e-05, "loss": 0.6194, "num_tokens": 936747132.0, "step": 224 }, { "epoch": 0.4507888805409467, "grad_norm": 0.4135399826122971, "learning_rate": 3.966431683882599e-05, "loss": 0.612, "num_tokens": 940941436.0, "step": 225 }, { "epoch": 0.4527923866766842, "grad_norm": 0.33556634069287833, "learning_rate": 3.9659810198676175e-05, "loss": 0.6046, "num_tokens": 945135740.0, "step": 226 }, { "epoch": 0.4547958928124217, "grad_norm": 0.3308250073162144, "learning_rate": 3.965527379701485e-05, "loss": 0.6238, "num_tokens": 949330044.0, "step": 227 }, { "epoch": 0.4567993989481593, "grad_norm": 0.28282671023486483, "learning_rate": 3.9650707641487076e-05, "loss": 0.6095, "num_tokens": 953524348.0, "step": 228 }, { "epoch": 0.4588029050838968, "grad_norm": 0.32030865799038233, "learning_rate": 3.964611173978805e-05, "loss": 0.6163, "num_tokens": 957689256.0, "step": 229 }, { "epoch": 0.46080641121963434, "grad_norm": 0.4296553783644718, "learning_rate": 3.964148609966308e-05, "loss": 0.6285, "num_tokens": 961883560.0, "step": 230 }, { "epoch": 0.4628099173553719, "grad_norm": 0.26152489218335445, "learning_rate": 3.963683072890763e-05, "loss": 0.6253, "num_tokens": 966068509.0, "step": 231 }, { "epoch": 0.46481342349110943, "grad_norm": 0.6041736553966052, "learning_rate": 3.963214563536723e-05, "loss": 0.6223, "num_tokens": 970239473.0, "step": 232 }, { "epoch": 0.46681692962684695, "grad_norm": 0.36454627360699815, "learning_rate": 3.962743082693754e-05, "loss": 0.616, "num_tokens": 974409713.0, "step": 233 }, { "epoch": 0.46882043576258453, "grad_norm": 0.6170971139834491, "learning_rate": 3.962268631156425e-05, "loss": 0.6279, "num_tokens": 978604017.0, "step": 234 }, { "epoch": 0.47082394189832205, "grad_norm": 0.4557188339039801, "learning_rate": 3.961791209724315e-05, "loss": 0.6221, "num_tokens": 982793112.0, "step": 235 }, { "epoch": 0.4728274480340596, "grad_norm": 0.5936809732381925, "learning_rate": 3.961310819202007e-05, "loss": 0.6302, "num_tokens": 986987416.0, "step": 236 }, { "epoch": 0.47483095416979715, "grad_norm": 0.437641385894973, "learning_rate": 3.960827460399088e-05, "loss": 0.619, "num_tokens": 991165413.0, "step": 237 }, { "epoch": 0.47683446030553467, "grad_norm": 0.6717722651953256, "learning_rate": 3.960341134130147e-05, "loss": 0.6162, "num_tokens": 995359717.0, "step": 238 }, { "epoch": 0.47883796644127224, "grad_norm": 0.5847284504449202, "learning_rate": 3.959851841214774e-05, "loss": 0.6189, "num_tokens": 999537464.0, "step": 239 }, { "epoch": 0.48084147257700977, "grad_norm": 0.583923054273653, "learning_rate": 3.9593595824775584e-05, "loss": 0.6065, "num_tokens": 1003717181.0, "step": 240 }, { "epoch": 0.4828449787127473, "grad_norm": 0.5814264822578828, "learning_rate": 3.958864358748087e-05, "loss": 0.6297, "num_tokens": 1007911485.0, "step": 241 }, { "epoch": 0.48484848484848486, "grad_norm": 0.5734911228968932, "learning_rate": 3.958366170860947e-05, "loss": 0.6214, "num_tokens": 1012075109.0, "step": 242 }, { "epoch": 0.4868519909842224, "grad_norm": 0.5401438861198599, "learning_rate": 3.957865019655717e-05, "loss": 0.6074, "num_tokens": 1016269413.0, "step": 243 }, { "epoch": 0.4888554971199599, "grad_norm": 0.5191708162715785, "learning_rate": 3.957360905976971e-05, "loss": 0.6191, "num_tokens": 1020463717.0, "step": 244 }, { "epoch": 0.4908590032556975, "grad_norm": 0.49618668197935967, "learning_rate": 3.956853830674276e-05, "loss": 0.6274, "num_tokens": 1024658021.0, "step": 245 }, { "epoch": 0.492862509391435, "grad_norm": 0.4594428988099506, "learning_rate": 3.9563437946021896e-05, "loss": 0.6169, "num_tokens": 1028825530.0, "step": 246 }, { "epoch": 0.4948660155271726, "grad_norm": 0.3859811780610177, "learning_rate": 3.955830798620259e-05, "loss": 0.6117, "num_tokens": 1032997899.0, "step": 247 }, { "epoch": 0.4968695216629101, "grad_norm": 0.5018121611416607, "learning_rate": 3.95531484359302e-05, "loss": 0.6205, "num_tokens": 1037182539.0, "step": 248 }, { "epoch": 0.4988730277986476, "grad_norm": 0.43167504381883703, "learning_rate": 3.954795930389995e-05, "loss": 0.6168, "num_tokens": 1041358415.0, "step": 249 }, { "epoch": 0.5008765339343851, "grad_norm": 0.45573952516881117, "learning_rate": 3.954274059885692e-05, "loss": 0.634, "num_tokens": 1045552719.0, "step": 250 }, { "epoch": 0.5028800400701228, "grad_norm": 0.3708490665068598, "learning_rate": 3.953749232959603e-05, "loss": 0.6214, "num_tokens": 1049747023.0, "step": 251 }, { "epoch": 0.5048835462058603, "grad_norm": 0.47771084866990304, "learning_rate": 3.953221450496201e-05, "loss": 0.6109, "num_tokens": 1053941327.0, "step": 252 }, { "epoch": 0.5068870523415978, "grad_norm": 0.28192393971483787, "learning_rate": 3.952690713384941e-05, "loss": 0.618, "num_tokens": 1058119150.0, "step": 253 }, { "epoch": 0.5088905584773353, "grad_norm": 0.5991663955054002, "learning_rate": 3.952157022520258e-05, "loss": 0.6168, "num_tokens": 1062288841.0, "step": 254 }, { "epoch": 0.5108940646130729, "grad_norm": 0.5205246017576622, "learning_rate": 3.951620378801563e-05, "loss": 0.613, "num_tokens": 1066483145.0, "step": 255 }, { "epoch": 0.5128975707488104, "grad_norm": 0.3974565177292702, "learning_rate": 3.951080783133245e-05, "loss": 0.6204, "num_tokens": 1070677449.0, "step": 256 }, { "epoch": 0.514901076884548, "grad_norm": 0.5053061010921718, "learning_rate": 3.950538236424668e-05, "loss": 0.6163, "num_tokens": 1074844632.0, "step": 257 }, { "epoch": 0.5169045830202855, "grad_norm": 0.3908651436056998, "learning_rate": 3.949992739590168e-05, "loss": 0.6158, "num_tokens": 1079020552.0, "step": 258 }, { "epoch": 0.518908089156023, "grad_norm": 0.37828759364031656, "learning_rate": 3.949444293549053e-05, "loss": 0.6258, "num_tokens": 1083214856.0, "step": 259 }, { "epoch": 0.5209115952917606, "grad_norm": 0.3932581151981951, "learning_rate": 3.948892899225602e-05, "loss": 0.6214, "num_tokens": 1087409160.0, "step": 260 }, { "epoch": 0.5229151014274981, "grad_norm": 0.30294823329076803, "learning_rate": 3.9483385575490604e-05, "loss": 0.6126, "num_tokens": 1091603464.0, "step": 261 }, { "epoch": 0.5249186075632357, "grad_norm": 0.32317097714541576, "learning_rate": 3.947781269453645e-05, "loss": 0.6063, "num_tokens": 1095797768.0, "step": 262 }, { "epoch": 0.5269221136989732, "grad_norm": 0.4665203462610598, "learning_rate": 3.947221035878535e-05, "loss": 0.6181, "num_tokens": 1099973109.0, "step": 263 }, { "epoch": 0.5289256198347108, "grad_norm": 0.23039887740623216, "learning_rate": 3.9466578577678726e-05, "loss": 0.623, "num_tokens": 1104149428.0, "step": 264 }, { "epoch": 0.5309291259704483, "grad_norm": 0.739093136000099, "learning_rate": 3.946091736070765e-05, "loss": 0.6043, "num_tokens": 1108330972.0, "step": 265 }, { "epoch": 0.5329326321061858, "grad_norm": 0.6884144846849021, "learning_rate": 3.9455226717412795e-05, "loss": 0.6143, "num_tokens": 1112525276.0, "step": 266 }, { "epoch": 0.5349361382419233, "grad_norm": 0.38834315140228076, "learning_rate": 3.94495066573844e-05, "loss": 0.6099, "num_tokens": 1116715654.0, "step": 267 }, { "epoch": 0.536939644377661, "grad_norm": 0.4549562515895791, "learning_rate": 3.9443757190262315e-05, "loss": 0.5992, "num_tokens": 1120894408.0, "step": 268 }, { "epoch": 0.5389431505133985, "grad_norm": 0.3864202985368748, "learning_rate": 3.943797832573593e-05, "loss": 0.6132, "num_tokens": 1125087018.0, "step": 269 }, { "epoch": 0.540946656649136, "grad_norm": 0.3682086713757355, "learning_rate": 3.943217007354417e-05, "loss": 0.612, "num_tokens": 1129231581.0, "step": 270 }, { "epoch": 0.5429501627848735, "grad_norm": 0.33237914782780076, "learning_rate": 3.942633244347551e-05, "loss": 0.6073, "num_tokens": 1133382897.0, "step": 271 }, { "epoch": 0.544953668920611, "grad_norm": 0.2805314080756397, "learning_rate": 3.942046544536791e-05, "loss": 0.5987, "num_tokens": 1137572900.0, "step": 272 }, { "epoch": 0.5469571750563486, "grad_norm": 0.38260978683588626, "learning_rate": 3.941456908910884e-05, "loss": 0.6111, "num_tokens": 1141767204.0, "step": 273 }, { "epoch": 0.5489606811920862, "grad_norm": 0.26149447342102866, "learning_rate": 3.9408643384635234e-05, "loss": 0.6032, "num_tokens": 1145896020.0, "step": 274 }, { "epoch": 0.5509641873278237, "grad_norm": 0.4901418571451076, "learning_rate": 3.9402688341933494e-05, "loss": 0.6155, "num_tokens": 1150090324.0, "step": 275 }, { "epoch": 0.5529676934635612, "grad_norm": 0.39411112617418403, "learning_rate": 3.939670397103947e-05, "loss": 0.6066, "num_tokens": 1154271180.0, "step": 276 }, { "epoch": 0.5549711995992987, "grad_norm": 0.47494253581058127, "learning_rate": 3.939069028203842e-05, "loss": 0.6136, "num_tokens": 1158465484.0, "step": 277 }, { "epoch": 0.5569747057350363, "grad_norm": 0.4022714084507304, "learning_rate": 3.9384647285065026e-05, "loss": 0.6156, "num_tokens": 1162654162.0, "step": 278 }, { "epoch": 0.5589782118707739, "grad_norm": 0.40529715152422396, "learning_rate": 3.937857499030336e-05, "loss": 0.5986, "num_tokens": 1166848466.0, "step": 279 }, { "epoch": 0.5609817180065114, "grad_norm": 0.35661582484458165, "learning_rate": 3.937247340798686e-05, "loss": 0.6107, "num_tokens": 1171042770.0, "step": 280 }, { "epoch": 0.5629852241422489, "grad_norm": 0.4035059883425247, "learning_rate": 3.936634254839832e-05, "loss": 0.6233, "num_tokens": 1175231940.0, "step": 281 }, { "epoch": 0.5649887302779865, "grad_norm": 0.3624510035651207, "learning_rate": 3.9360182421869896e-05, "loss": 0.6193, "num_tokens": 1179410475.0, "step": 282 }, { "epoch": 0.566992236413724, "grad_norm": 0.4236686399756323, "learning_rate": 3.935399303878305e-05, "loss": 0.6044, "num_tokens": 1183588951.0, "step": 283 }, { "epoch": 0.5689957425494615, "grad_norm": 0.389439731857124, "learning_rate": 3.9347774409568536e-05, "loss": 0.6228, "num_tokens": 1187783255.0, "step": 284 }, { "epoch": 0.5709992486851991, "grad_norm": 0.36599400487983175, "learning_rate": 3.934152654470642e-05, "loss": 0.6059, "num_tokens": 1191977559.0, "step": 285 }, { "epoch": 0.5730027548209367, "grad_norm": 0.27838793002089984, "learning_rate": 3.9335249454726036e-05, "loss": 0.6053, "num_tokens": 1196171863.0, "step": 286 }, { "epoch": 0.5750062609566742, "grad_norm": 0.40867962017664916, "learning_rate": 3.932894315020595e-05, "loss": 0.6008, "num_tokens": 1200361824.0, "step": 287 }, { "epoch": 0.5770097670924117, "grad_norm": 0.30905056335876213, "learning_rate": 3.932260764177398e-05, "loss": 0.6139, "num_tokens": 1204503620.0, "step": 288 }, { "epoch": 0.5790132732281492, "grad_norm": 0.4501785994794995, "learning_rate": 3.931624294010716e-05, "loss": 0.6223, "num_tokens": 1208697924.0, "step": 289 }, { "epoch": 0.5810167793638868, "grad_norm": 0.40251425745266745, "learning_rate": 3.93098490559317e-05, "loss": 0.6161, "num_tokens": 1212873258.0, "step": 290 }, { "epoch": 0.5830202854996244, "grad_norm": 0.36244206994346256, "learning_rate": 3.930342600002303e-05, "loss": 0.6021, "num_tokens": 1217067562.0, "step": 291 }, { "epoch": 0.5850237916353619, "grad_norm": 0.3780453728164347, "learning_rate": 3.929697378320571e-05, "loss": 0.6089, "num_tokens": 1221261866.0, "step": 292 }, { "epoch": 0.5870272977710994, "grad_norm": 0.31795637937608157, "learning_rate": 3.929049241635344e-05, "loss": 0.612, "num_tokens": 1225434834.0, "step": 293 }, { "epoch": 0.5890308039068369, "grad_norm": 0.3339252624858417, "learning_rate": 3.928398191038908e-05, "loss": 0.6022, "num_tokens": 1229629138.0, "step": 294 }, { "epoch": 0.5910343100425745, "grad_norm": 0.3167877878699543, "learning_rate": 3.9277442276284576e-05, "loss": 0.6162, "num_tokens": 1233823442.0, "step": 295 }, { "epoch": 0.5930378161783121, "grad_norm": 0.2764570534821325, "learning_rate": 3.927087352506096e-05, "loss": 0.6097, "num_tokens": 1238006999.0, "step": 296 }, { "epoch": 0.5950413223140496, "grad_norm": 0.25290255162241937, "learning_rate": 3.926427566778832e-05, "loss": 0.5818, "num_tokens": 1242201303.0, "step": 297 }, { "epoch": 0.5970448284497871, "grad_norm": 0.21616893099320736, "learning_rate": 3.925764871558584e-05, "loss": 0.6079, "num_tokens": 1246392450.0, "step": 298 }, { "epoch": 0.5990483345855246, "grad_norm": 0.3816570537695835, "learning_rate": 3.92509926796217e-05, "loss": 0.6078, "num_tokens": 1250586754.0, "step": 299 }, { "epoch": 0.6010518407212622, "grad_norm": 0.2592094603454731, "learning_rate": 3.9244307571113095e-05, "loss": 0.6151, "num_tokens": 1254737489.0, "step": 300 }, { "epoch": 0.6030553468569998, "grad_norm": 0.41870770679948405, "learning_rate": 3.9237593401326236e-05, "loss": 0.6076, "num_tokens": 1258931793.0, "step": 301 }, { "epoch": 0.6050588529927373, "grad_norm": 0.3381078694153498, "learning_rate": 3.923085018157629e-05, "loss": 0.6003, "num_tokens": 1263126097.0, "step": 302 }, { "epoch": 0.6070623591284748, "grad_norm": 0.31684536398649055, "learning_rate": 3.9224077923227396e-05, "loss": 0.6073, "num_tokens": 1267310104.0, "step": 303 }, { "epoch": 0.6090658652642124, "grad_norm": 0.2620489862738512, "learning_rate": 3.921727663769261e-05, "loss": 0.5961, "num_tokens": 1271475422.0, "step": 304 }, { "epoch": 0.6110693713999499, "grad_norm": 0.2825582617842015, "learning_rate": 3.921044633643394e-05, "loss": 0.6125, "num_tokens": 1275647589.0, "step": 305 }, { "epoch": 0.6130728775356874, "grad_norm": 0.2570578554602573, "learning_rate": 3.920358703096226e-05, "loss": 0.6164, "num_tokens": 1279824548.0, "step": 306 }, { "epoch": 0.615076383671425, "grad_norm": 0.35301657856159985, "learning_rate": 3.9196698732837346e-05, "loss": 0.6097, "num_tokens": 1284018852.0, "step": 307 }, { "epoch": 0.6170798898071626, "grad_norm": 0.2400140997649322, "learning_rate": 3.9189781453667814e-05, "loss": 0.6039, "num_tokens": 1288213156.0, "step": 308 }, { "epoch": 0.6190833959429001, "grad_norm": 0.4051121986358227, "learning_rate": 3.918283520511114e-05, "loss": 0.6055, "num_tokens": 1292407460.0, "step": 309 }, { "epoch": 0.6210869020786376, "grad_norm": 0.3225007798122433, "learning_rate": 3.917585999887361e-05, "loss": 0.603, "num_tokens": 1296593344.0, "step": 310 }, { "epoch": 0.6230904082143751, "grad_norm": 0.3271500863516042, "learning_rate": 3.916885584671032e-05, "loss": 0.6077, "num_tokens": 1300773742.0, "step": 311 }, { "epoch": 0.6250939143501127, "grad_norm": 0.35998686494044474, "learning_rate": 3.916182276042515e-05, "loss": 0.6115, "num_tokens": 1304968046.0, "step": 312 }, { "epoch": 0.6270974204858503, "grad_norm": 0.24485481701221962, "learning_rate": 3.915476075187072e-05, "loss": 0.5963, "num_tokens": 1309162350.0, "step": 313 }, { "epoch": 0.6291009266215878, "grad_norm": 0.3234133735285083, "learning_rate": 3.914766983294841e-05, "loss": 0.6071, "num_tokens": 1313332745.0, "step": 314 }, { "epoch": 0.6311044327573253, "grad_norm": 0.27180075662062286, "learning_rate": 3.914055001560832e-05, "loss": 0.6039, "num_tokens": 1317506578.0, "step": 315 }, { "epoch": 0.6331079388930628, "grad_norm": 0.37954340272030046, "learning_rate": 3.913340131184925e-05, "loss": 0.606, "num_tokens": 1321700882.0, "step": 316 }, { "epoch": 0.6351114450288003, "grad_norm": 0.3431409755765106, "learning_rate": 3.912622373371869e-05, "loss": 0.6057, "num_tokens": 1325892502.0, "step": 317 }, { "epoch": 0.637114951164538, "grad_norm": 0.3105670215292682, "learning_rate": 3.911901729331277e-05, "loss": 0.6091, "num_tokens": 1330030305.0, "step": 318 }, { "epoch": 0.6391184573002755, "grad_norm": 0.45540503787786135, "learning_rate": 3.911178200277627e-05, "loss": 0.6193, "num_tokens": 1334224609.0, "step": 319 }, { "epoch": 0.641121963436013, "grad_norm": 0.26844684753034237, "learning_rate": 3.91045178743026e-05, "loss": 0.614, "num_tokens": 1338418913.0, "step": 320 }, { "epoch": 0.6431254695717505, "grad_norm": 0.40588533599952126, "learning_rate": 3.909722492013377e-05, "loss": 0.6071, "num_tokens": 1342593114.0, "step": 321 }, { "epoch": 0.6451289757074881, "grad_norm": 0.3610970244813676, "learning_rate": 3.9089903152560346e-05, "loss": 0.6126, "num_tokens": 1346787418.0, "step": 322 }, { "epoch": 0.6471324818432257, "grad_norm": 0.30061271397443984, "learning_rate": 3.9082552583921474e-05, "loss": 0.6177, "num_tokens": 1350967443.0, "step": 323 }, { "epoch": 0.6491359879789632, "grad_norm": 0.3731109557869287, "learning_rate": 3.907517322660484e-05, "loss": 0.5996, "num_tokens": 1355161747.0, "step": 324 }, { "epoch": 0.6511394941147007, "grad_norm": 0.2695216810355298, "learning_rate": 3.906776509304663e-05, "loss": 0.5983, "num_tokens": 1359356051.0, "step": 325 }, { "epoch": 0.6531430002504383, "grad_norm": 0.278380785041166, "learning_rate": 3.906032819573153e-05, "loss": 0.6174, "num_tokens": 1363550355.0, "step": 326 }, { "epoch": 0.6551465063861758, "grad_norm": 0.3629726194800412, "learning_rate": 3.905286254719271e-05, "loss": 0.5954, "num_tokens": 1367744659.0, "step": 327 }, { "epoch": 0.6571500125219133, "grad_norm": 0.265917482768394, "learning_rate": 3.90453681600118e-05, "loss": 0.6057, "num_tokens": 1371938963.0, "step": 328 }, { "epoch": 0.6591535186576509, "grad_norm": 0.38255666924891507, "learning_rate": 3.903784504681882e-05, "loss": 0.6036, "num_tokens": 1376133267.0, "step": 329 }, { "epoch": 0.6611570247933884, "grad_norm": 0.389763999256542, "learning_rate": 3.903029322029226e-05, "loss": 0.5964, "num_tokens": 1380317890.0, "step": 330 }, { "epoch": 0.663160530929126, "grad_norm": 0.29530463596641393, "learning_rate": 3.902271269315895e-05, "loss": 0.6063, "num_tokens": 1384498703.0, "step": 331 }, { "epoch": 0.6651640370648635, "grad_norm": 0.40490209246831743, "learning_rate": 3.9015103478194124e-05, "loss": 0.6006, "num_tokens": 1388693007.0, "step": 332 }, { "epoch": 0.667167543200601, "grad_norm": 0.26487388623808683, "learning_rate": 3.900746558822135e-05, "loss": 0.6058, "num_tokens": 1392861675.0, "step": 333 }, { "epoch": 0.6691710493363386, "grad_norm": 0.46641162373637396, "learning_rate": 3.89997990361125e-05, "loss": 0.6033, "num_tokens": 1397044546.0, "step": 334 }, { "epoch": 0.6711745554720762, "grad_norm": 0.27485833017491595, "learning_rate": 3.899210383478778e-05, "loss": 0.5893, "num_tokens": 1401236054.0, "step": 335 }, { "epoch": 0.6731780616078137, "grad_norm": 0.49933692973486793, "learning_rate": 3.898437999721568e-05, "loss": 0.601, "num_tokens": 1405415779.0, "step": 336 }, { "epoch": 0.6751815677435512, "grad_norm": 0.37524291489743966, "learning_rate": 3.897662753641291e-05, "loss": 0.6058, "num_tokens": 1409578099.0, "step": 337 }, { "epoch": 0.6771850738792887, "grad_norm": 0.5440979631562645, "learning_rate": 3.896884646544446e-05, "loss": 0.6067, "num_tokens": 1413759460.0, "step": 338 }, { "epoch": 0.6791885800150262, "grad_norm": 0.5132861007042655, "learning_rate": 3.896103679742351e-05, "loss": 0.5999, "num_tokens": 1417953764.0, "step": 339 }, { "epoch": 0.6811920861507639, "grad_norm": 0.4285131333362848, "learning_rate": 3.895319854551145e-05, "loss": 0.6117, "num_tokens": 1422148068.0, "step": 340 }, { "epoch": 0.6831955922865014, "grad_norm": 0.4217683594115467, "learning_rate": 3.894533172291783e-05, "loss": 0.5989, "num_tokens": 1426311697.0, "step": 341 }, { "epoch": 0.6851990984222389, "grad_norm": 0.4224524795938551, "learning_rate": 3.893743634290035e-05, "loss": 0.6089, "num_tokens": 1430506001.0, "step": 342 }, { "epoch": 0.6872026045579764, "grad_norm": 0.40395165703369307, "learning_rate": 3.8929512418764856e-05, "loss": 0.6068, "num_tokens": 1434668968.0, "step": 343 }, { "epoch": 0.689206110693714, "grad_norm": 0.4120146002865543, "learning_rate": 3.8921559963865264e-05, "loss": 0.5843, "num_tokens": 1438863272.0, "step": 344 }, { "epoch": 0.6912096168294516, "grad_norm": 0.4149207330703223, "learning_rate": 3.89135789916036e-05, "loss": 0.603, "num_tokens": 1443026386.0, "step": 345 }, { "epoch": 0.6932131229651891, "grad_norm": 0.37828693500459193, "learning_rate": 3.890556951542995e-05, "loss": 0.5803, "num_tokens": 1447220690.0, "step": 346 }, { "epoch": 0.6952166291009266, "grad_norm": 0.35969544270083664, "learning_rate": 3.889753154884241e-05, "loss": 0.6067, "num_tokens": 1451351979.0, "step": 347 }, { "epoch": 0.6972201352366642, "grad_norm": 0.3652342369715886, "learning_rate": 3.8889465105387126e-05, "loss": 0.6081, "num_tokens": 1455538821.0, "step": 348 }, { "epoch": 0.6992236413724017, "grad_norm": 0.3273182356265898, "learning_rate": 3.8881370198658196e-05, "loss": 0.6055, "num_tokens": 1459724132.0, "step": 349 }, { "epoch": 0.7012271475081392, "grad_norm": 0.34285419717241883, "learning_rate": 3.887324684229771e-05, "loss": 0.5841, "num_tokens": 1463900771.0, "step": 350 }, { "epoch": 0.7032306536438768, "grad_norm": 0.2779471167755664, "learning_rate": 3.8865095049995714e-05, "loss": 0.6054, "num_tokens": 1468069783.0, "step": 351 }, { "epoch": 0.7052341597796143, "grad_norm": 0.31425219765959994, "learning_rate": 3.8856914835490154e-05, "loss": 0.5947, "num_tokens": 1472264087.0, "step": 352 }, { "epoch": 0.7072376659153519, "grad_norm": 0.22621780735753347, "learning_rate": 3.884870621256688e-05, "loss": 0.5967, "num_tokens": 1476458391.0, "step": 353 }, { "epoch": 0.7092411720510894, "grad_norm": 0.3006046296994218, "learning_rate": 3.884046919505962e-05, "loss": 0.5892, "num_tokens": 1480644434.0, "step": 354 }, { "epoch": 0.7112446781868269, "grad_norm": 0.31276493300009356, "learning_rate": 3.8832203796849964e-05, "loss": 0.6007, "num_tokens": 1484807749.0, "step": 355 }, { "epoch": 0.7132481843225645, "grad_norm": 0.2950236631296911, "learning_rate": 3.882391003186732e-05, "loss": 0.5946, "num_tokens": 1489002053.0, "step": 356 }, { "epoch": 0.7152516904583021, "grad_norm": 0.2861992405001696, "learning_rate": 3.88155879140889e-05, "loss": 0.5855, "num_tokens": 1493182905.0, "step": 357 }, { "epoch": 0.7172551965940396, "grad_norm": 0.32471966504587324, "learning_rate": 3.8807237457539705e-05, "loss": 0.6045, "num_tokens": 1497377209.0, "step": 358 }, { "epoch": 0.7192587027297771, "grad_norm": 0.2541509386234667, "learning_rate": 3.879885867629248e-05, "loss": 0.595, "num_tokens": 1501556558.0, "step": 359 }, { "epoch": 0.7212622088655146, "grad_norm": 0.4062924640437344, "learning_rate": 3.879045158446773e-05, "loss": 0.606, "num_tokens": 1505750862.0, "step": 360 }, { "epoch": 0.7232657150012521, "grad_norm": 0.3489900058642136, "learning_rate": 3.878201619623366e-05, "loss": 0.6013, "num_tokens": 1509945166.0, "step": 361 }, { "epoch": 0.7252692211369898, "grad_norm": 0.35445811078418965, "learning_rate": 3.8773552525806144e-05, "loss": 0.6015, "num_tokens": 1514107179.0, "step": 362 }, { "epoch": 0.7272727272727273, "grad_norm": 0.3634938452347441, "learning_rate": 3.8765060587448747e-05, "loss": 0.6006, "num_tokens": 1518282403.0, "step": 363 }, { "epoch": 0.7292762334084648, "grad_norm": 0.2987664478110601, "learning_rate": 3.875654039547266e-05, "loss": 0.5881, "num_tokens": 1522462317.0, "step": 364 }, { "epoch": 0.7312797395442023, "grad_norm": 0.30421815794001095, "learning_rate": 3.874799196423668e-05, "loss": 0.6047, "num_tokens": 1526656621.0, "step": 365 }, { "epoch": 0.7332832456799399, "grad_norm": 0.28344115447593027, "learning_rate": 3.87394153081472e-05, "loss": 0.607, "num_tokens": 1530850925.0, "step": 366 }, { "epoch": 0.7352867518156775, "grad_norm": 0.27672647890304986, "learning_rate": 3.8730810441658204e-05, "loss": 0.6006, "num_tokens": 1535045229.0, "step": 367 }, { "epoch": 0.737290257951415, "grad_norm": 0.24020749544295755, "learning_rate": 3.872217737927118e-05, "loss": 0.6125, "num_tokens": 1539235151.0, "step": 368 }, { "epoch": 0.7392937640871525, "grad_norm": 0.31660868829727873, "learning_rate": 3.871351613553516e-05, "loss": 0.5922, "num_tokens": 1543399505.0, "step": 369 }, { "epoch": 0.74129727022289, "grad_norm": 0.2182884214476191, "learning_rate": 3.870482672504666e-05, "loss": 0.5866, "num_tokens": 1547570463.0, "step": 370 }, { "epoch": 0.7433007763586276, "grad_norm": 0.4485543855659915, "learning_rate": 3.869610916244966e-05, "loss": 0.5996, "num_tokens": 1551764767.0, "step": 371 }, { "epoch": 0.7453042824943651, "grad_norm": 0.28966963294008463, "learning_rate": 3.868736346243558e-05, "loss": 0.5898, "num_tokens": 1555959071.0, "step": 372 }, { "epoch": 0.7473077886301027, "grad_norm": 0.4786584379247093, "learning_rate": 3.8678589639743286e-05, "loss": 0.5863, "num_tokens": 1560153375.0, "step": 373 }, { "epoch": 0.7493112947658402, "grad_norm": 0.44486947816395583, "learning_rate": 3.8669787709159014e-05, "loss": 0.5961, "num_tokens": 1564347679.0, "step": 374 }, { "epoch": 0.7513148009015778, "grad_norm": 0.3741521254818676, "learning_rate": 3.866095768551637e-05, "loss": 0.6134, "num_tokens": 1568541983.0, "step": 375 }, { "epoch": 0.7533183070373153, "grad_norm": 0.38360820725654776, "learning_rate": 3.865209958369631e-05, "loss": 0.6043, "num_tokens": 1572736287.0, "step": 376 }, { "epoch": 0.7553218131730528, "grad_norm": 0.35384482171308684, "learning_rate": 3.864321341862711e-05, "loss": 0.5968, "num_tokens": 1576906871.0, "step": 377 }, { "epoch": 0.7573253193087904, "grad_norm": 0.34941446788407043, "learning_rate": 3.8634299205284345e-05, "loss": 0.5865, "num_tokens": 1581080315.0, "step": 378 }, { "epoch": 0.759328825444528, "grad_norm": 0.30053297169565113, "learning_rate": 3.862535695869085e-05, "loss": 0.5921, "num_tokens": 1585274619.0, "step": 379 }, { "epoch": 0.7613323315802655, "grad_norm": 0.35112638982527544, "learning_rate": 3.86163866939167e-05, "loss": 0.598, "num_tokens": 1589453495.0, "step": 380 }, { "epoch": 0.763335837716003, "grad_norm": 0.2916439947784345, "learning_rate": 3.8607388426079195e-05, "loss": 0.5994, "num_tokens": 1593647799.0, "step": 381 }, { "epoch": 0.7653393438517405, "grad_norm": 0.3295825323876031, "learning_rate": 3.859836217034283e-05, "loss": 0.598, "num_tokens": 1597833780.0, "step": 382 }, { "epoch": 0.767342849987478, "grad_norm": 0.3031180259801727, "learning_rate": 3.8589307941919264e-05, "loss": 0.6113, "num_tokens": 1602028084.0, "step": 383 }, { "epoch": 0.7693463561232157, "grad_norm": 0.2755784097374644, "learning_rate": 3.858022575606731e-05, "loss": 0.5829, "num_tokens": 1606222388.0, "step": 384 }, { "epoch": 0.7713498622589532, "grad_norm": 0.35633145965336616, "learning_rate": 3.857111562809286e-05, "loss": 0.5804, "num_tokens": 1610416692.0, "step": 385 }, { "epoch": 0.7733533683946907, "grad_norm": 0.2675951991945383, "learning_rate": 3.8561977573348936e-05, "loss": 0.601, "num_tokens": 1614605532.0, "step": 386 }, { "epoch": 0.7753568745304282, "grad_norm": 0.32320105318467823, "learning_rate": 3.855281160723561e-05, "loss": 0.6023, "num_tokens": 1618799836.0, "step": 387 }, { "epoch": 0.7773603806661658, "grad_norm": 0.2628473495512423, "learning_rate": 3.8543617745199985e-05, "loss": 0.5875, "num_tokens": 1622993590.0, "step": 388 }, { "epoch": 0.7793638868019034, "grad_norm": 0.28617119215882475, "learning_rate": 3.853439600273617e-05, "loss": 0.6069, "num_tokens": 1627180889.0, "step": 389 }, { "epoch": 0.7813673929376409, "grad_norm": 0.2543565122679629, "learning_rate": 3.85251463953853e-05, "loss": 0.5865, "num_tokens": 1631350997.0, "step": 390 }, { "epoch": 0.7833708990733784, "grad_norm": 0.2565502262251373, "learning_rate": 3.851586893873542e-05, "loss": 0.5881, "num_tokens": 1635536901.0, "step": 391 }, { "epoch": 0.7853744052091159, "grad_norm": 0.26723047844392767, "learning_rate": 3.8506563648421536e-05, "loss": 0.5941, "num_tokens": 1639731205.0, "step": 392 }, { "epoch": 0.7873779113448535, "grad_norm": 0.2900526703862786, "learning_rate": 3.849723054012556e-05, "loss": 0.568, "num_tokens": 1643925509.0, "step": 393 }, { "epoch": 0.789381417480591, "grad_norm": 0.3680000966802013, "learning_rate": 3.848786962957626e-05, "loss": 0.5969, "num_tokens": 1648119813.0, "step": 394 }, { "epoch": 0.7913849236163286, "grad_norm": 0.2076088671153757, "learning_rate": 3.847848093254931e-05, "loss": 0.5877, "num_tokens": 1652305969.0, "step": 395 }, { "epoch": 0.7933884297520661, "grad_norm": 0.6369917950372805, "learning_rate": 3.846906446486716e-05, "loss": 0.5949, "num_tokens": 1656495154.0, "step": 396 }, { "epoch": 0.7953919358878037, "grad_norm": 0.5320060970842035, "learning_rate": 3.8459620242399086e-05, "loss": 0.592, "num_tokens": 1660682282.0, "step": 397 }, { "epoch": 0.7973954420235412, "grad_norm": 0.4465984913063304, "learning_rate": 3.845014828106112e-05, "loss": 0.5997, "num_tokens": 1664860176.0, "step": 398 }, { "epoch": 0.7993989481592787, "grad_norm": 0.40903059824567545, "learning_rate": 3.844064859681608e-05, "loss": 0.5911, "num_tokens": 1669042722.0, "step": 399 }, { "epoch": 0.8014024542950163, "grad_norm": 0.450104283165802, "learning_rate": 3.843112120567345e-05, "loss": 0.5876, "num_tokens": 1673220075.0, "step": 400 }, { "epoch": 0.8034059604307539, "grad_norm": 0.27855301397733195, "learning_rate": 3.8421566123689455e-05, "loss": 0.587, "num_tokens": 1677344465.0, "step": 401 }, { "epoch": 0.8054094665664914, "grad_norm": 0.574845952280324, "learning_rate": 3.841198336696697e-05, "loss": 0.5911, "num_tokens": 1681538769.0, "step": 402 }, { "epoch": 0.8074129727022289, "grad_norm": 0.4705240866085387, "learning_rate": 3.84023729516555e-05, "loss": 0.5969, "num_tokens": 1685723792.0, "step": 403 }, { "epoch": 0.8094164788379664, "grad_norm": 0.535253506342311, "learning_rate": 3.839273489395117e-05, "loss": 0.5848, "num_tokens": 1689918096.0, "step": 404 }, { "epoch": 0.8114199849737039, "grad_norm": 0.4532592787548968, "learning_rate": 3.8383069210096694e-05, "loss": 0.588, "num_tokens": 1694112400.0, "step": 405 }, { "epoch": 0.8134234911094416, "grad_norm": 0.5290483396680136, "learning_rate": 3.8373375916381336e-05, "loss": 0.5981, "num_tokens": 1698301396.0, "step": 406 }, { "epoch": 0.8154269972451791, "grad_norm": 0.5086479959941619, "learning_rate": 3.8363655029140885e-05, "loss": 0.5805, "num_tokens": 1702495700.0, "step": 407 }, { "epoch": 0.8174305033809166, "grad_norm": 0.3370894611862267, "learning_rate": 3.8353906564757654e-05, "loss": 0.607, "num_tokens": 1706690004.0, "step": 408 }, { "epoch": 0.8194340095166541, "grad_norm": 0.40811763178013766, "learning_rate": 3.834413053966041e-05, "loss": 0.6093, "num_tokens": 1710884308.0, "step": 409 }, { "epoch": 0.8214375156523916, "grad_norm": 0.35139997922121124, "learning_rate": 3.8334326970324374e-05, "loss": 0.5914, "num_tokens": 1715078612.0, "step": 410 }, { "epoch": 0.8234410217881293, "grad_norm": 0.27157360565203237, "learning_rate": 3.83244958732712e-05, "loss": 0.5929, "num_tokens": 1719260978.0, "step": 411 }, { "epoch": 0.8254445279238668, "grad_norm": 0.42250012277940235, "learning_rate": 3.831463726506889e-05, "loss": 0.6024, "num_tokens": 1723454606.0, "step": 412 }, { "epoch": 0.8274480340596043, "grad_norm": 0.2638334706249561, "learning_rate": 3.830475116233187e-05, "loss": 0.5917, "num_tokens": 1727627440.0, "step": 413 }, { "epoch": 0.8294515401953418, "grad_norm": 0.5057113669039587, "learning_rate": 3.829483758172086e-05, "loss": 0.6093, "num_tokens": 1731821744.0, "step": 414 }, { "epoch": 0.8314550463310794, "grad_norm": 0.36452569012611347, "learning_rate": 3.82848965399429e-05, "loss": 0.5991, "num_tokens": 1736016048.0, "step": 415 }, { "epoch": 0.8334585524668169, "grad_norm": 0.4887931704828934, "learning_rate": 3.8274928053751306e-05, "loss": 0.5921, "num_tokens": 1740168425.0, "step": 416 }, { "epoch": 0.8354620586025545, "grad_norm": 0.42114609950358917, "learning_rate": 3.826493213994565e-05, "loss": 0.5825, "num_tokens": 1744362729.0, "step": 417 }, { "epoch": 0.837465564738292, "grad_norm": 0.39335782473911035, "learning_rate": 3.825490881537173e-05, "loss": 0.5751, "num_tokens": 1748545031.0, "step": 418 }, { "epoch": 0.8394690708740296, "grad_norm": 0.40854066012240126, "learning_rate": 3.824485809692153e-05, "loss": 0.5915, "num_tokens": 1752739335.0, "step": 419 }, { "epoch": 0.8414725770097671, "grad_norm": 0.402110279425457, "learning_rate": 3.82347800015332e-05, "loss": 0.5917, "num_tokens": 1756915895.0, "step": 420 }, { "epoch": 0.8434760831455046, "grad_norm": 0.36363683010904896, "learning_rate": 3.822467454619103e-05, "loss": 0.5887, "num_tokens": 1761084100.0, "step": 421 }, { "epoch": 0.8454795892812422, "grad_norm": 0.3807527156779734, "learning_rate": 3.821454174792543e-05, "loss": 0.5989, "num_tokens": 1765278404.0, "step": 422 }, { "epoch": 0.8474830954169797, "grad_norm": 0.3768517292469064, "learning_rate": 3.820438162381286e-05, "loss": 0.5985, "num_tokens": 1769458241.0, "step": 423 }, { "epoch": 0.8494866015527173, "grad_norm": 0.37588768379815535, "learning_rate": 3.819419419097587e-05, "loss": 0.5979, "num_tokens": 1773627516.0, "step": 424 }, { "epoch": 0.8514901076884548, "grad_norm": 0.3069424950106743, "learning_rate": 3.818397946658301e-05, "loss": 0.5829, "num_tokens": 1777821820.0, "step": 425 }, { "epoch": 0.8534936138241923, "grad_norm": 0.3539708212411567, "learning_rate": 3.817373746784882e-05, "loss": 0.5909, "num_tokens": 1781988713.0, "step": 426 }, { "epoch": 0.8554971199599298, "grad_norm": 0.30355859191235807, "learning_rate": 3.816346821203382e-05, "loss": 0.5799, "num_tokens": 1786183017.0, "step": 427 }, { "epoch": 0.8575006260956675, "grad_norm": 0.36247301641341906, "learning_rate": 3.8153171716444455e-05, "loss": 0.5854, "num_tokens": 1790377321.0, "step": 428 }, { "epoch": 0.859504132231405, "grad_norm": 0.2503208599675681, "learning_rate": 3.814284799843307e-05, "loss": 0.5819, "num_tokens": 1794522013.0, "step": 429 }, { "epoch": 0.8615076383671425, "grad_norm": 0.4550089429795267, "learning_rate": 3.813249707539791e-05, "loss": 0.5778, "num_tokens": 1798693517.0, "step": 430 }, { "epoch": 0.86351114450288, "grad_norm": 0.3799715951170476, "learning_rate": 3.812211896478305e-05, "loss": 0.5884, "num_tokens": 1802887821.0, "step": 431 }, { "epoch": 0.8655146506386175, "grad_norm": 0.3081520024730645, "learning_rate": 3.8111713684078376e-05, "loss": 0.5862, "num_tokens": 1807004529.0, "step": 432 }, { "epoch": 0.8675181567743552, "grad_norm": 0.3297104776672864, "learning_rate": 3.810128125081958e-05, "loss": 0.5721, "num_tokens": 1811198833.0, "step": 433 }, { "epoch": 0.8695216629100927, "grad_norm": 0.29906334147999286, "learning_rate": 3.809082168258811e-05, "loss": 0.5956, "num_tokens": 1815349135.0, "step": 434 }, { "epoch": 0.8715251690458302, "grad_norm": 0.2531966387524201, "learning_rate": 3.8080334997011144e-05, "loss": 0.5863, "num_tokens": 1819524345.0, "step": 435 }, { "epoch": 0.8735286751815677, "grad_norm": 0.3405010139521172, "learning_rate": 3.8069821211761554e-05, "loss": 0.5627, "num_tokens": 1823694820.0, "step": 436 }, { "epoch": 0.8755321813173053, "grad_norm": 0.23479022423394907, "learning_rate": 3.805928034455787e-05, "loss": 0.5761, "num_tokens": 1827867685.0, "step": 437 }, { "epoch": 0.8775356874530428, "grad_norm": 0.4118335797577002, "learning_rate": 3.8048712413164294e-05, "loss": 0.5761, "num_tokens": 1832057438.0, "step": 438 }, { "epoch": 0.8795391935887804, "grad_norm": 0.3361006803517675, "learning_rate": 3.803811743539062e-05, "loss": 0.588, "num_tokens": 1836251742.0, "step": 439 }, { "epoch": 0.8815426997245179, "grad_norm": 0.321396394245055, "learning_rate": 3.802749542909223e-05, "loss": 0.5916, "num_tokens": 1840430983.0, "step": 440 }, { "epoch": 0.8835462058602555, "grad_norm": 0.3090474945655915, "learning_rate": 3.801684641217003e-05, "loss": 0.5888, "num_tokens": 1844625287.0, "step": 441 }, { "epoch": 0.885549711995993, "grad_norm": 0.29060670738746924, "learning_rate": 3.800617040257048e-05, "loss": 0.5818, "num_tokens": 1848791538.0, "step": 442 }, { "epoch": 0.8875532181317305, "grad_norm": 0.266025485918288, "learning_rate": 3.799546741828552e-05, "loss": 0.5911, "num_tokens": 1852985842.0, "step": 443 }, { "epoch": 0.8895567242674681, "grad_norm": 0.30275611640669975, "learning_rate": 3.798473747735254e-05, "loss": 0.5779, "num_tokens": 1857180146.0, "step": 444 }, { "epoch": 0.8915602304032056, "grad_norm": 0.19390689129939542, "learning_rate": 3.7973980597854365e-05, "loss": 0.5683, "num_tokens": 1861358506.0, "step": 445 }, { "epoch": 0.8935637365389432, "grad_norm": 0.36206917626866325, "learning_rate": 3.7963196797919224e-05, "loss": 0.5782, "num_tokens": 1865552810.0, "step": 446 }, { "epoch": 0.8955672426746807, "grad_norm": 0.27448577354316955, "learning_rate": 3.79523860957207e-05, "loss": 0.5875, "num_tokens": 1869747114.0, "step": 447 }, { "epoch": 0.8975707488104182, "grad_norm": 0.38803093870203986, "learning_rate": 3.794154850947772e-05, "loss": 0.5872, "num_tokens": 1873894805.0, "step": 448 }, { "epoch": 0.8995742549461557, "grad_norm": 0.39486471156601083, "learning_rate": 3.7930684057454545e-05, "loss": 0.5894, "num_tokens": 1878089109.0, "step": 449 }, { "epoch": 0.9015777610818934, "grad_norm": 0.2633959070344954, "learning_rate": 3.791979275796066e-05, "loss": 0.5786, "num_tokens": 1882277164.0, "step": 450 }, { "epoch": 0.9035812672176309, "grad_norm": 0.3557469874795336, "learning_rate": 3.790887462935084e-05, "loss": 0.5877, "num_tokens": 1886442603.0, "step": 451 }, { "epoch": 0.9055847733533684, "grad_norm": 0.24071028003965983, "learning_rate": 3.7897929690025044e-05, "loss": 0.592, "num_tokens": 1890617134.0, "step": 452 }, { "epoch": 0.9075882794891059, "grad_norm": 0.2553630793713854, "learning_rate": 3.788695795842843e-05, "loss": 0.5841, "num_tokens": 1894811438.0, "step": 453 }, { "epoch": 0.9095917856248434, "grad_norm": 0.24119606174603697, "learning_rate": 3.7875959453051316e-05, "loss": 0.5881, "num_tokens": 1899005742.0, "step": 454 }, { "epoch": 0.911595291760581, "grad_norm": 0.2160300770283946, "learning_rate": 3.786493419242912e-05, "loss": 0.5777, "num_tokens": 1903200046.0, "step": 455 }, { "epoch": 0.9135987978963186, "grad_norm": 0.25865574590718377, "learning_rate": 3.785388219514236e-05, "loss": 0.5796, "num_tokens": 1907378621.0, "step": 456 }, { "epoch": 0.9156023040320561, "grad_norm": 0.2729924515095762, "learning_rate": 3.784280347981662e-05, "loss": 0.5842, "num_tokens": 1911572925.0, "step": 457 }, { "epoch": 0.9176058101677936, "grad_norm": 0.2586883593189898, "learning_rate": 3.783169806512249e-05, "loss": 0.5909, "num_tokens": 1915767229.0, "step": 458 }, { "epoch": 0.9196093163035312, "grad_norm": 0.226039219993987, "learning_rate": 3.782056596977558e-05, "loss": 0.5926, "num_tokens": 1919961533.0, "step": 459 }, { "epoch": 0.9216128224392687, "grad_norm": 0.22833641143374084, "learning_rate": 3.780940721253646e-05, "loss": 0.5834, "num_tokens": 1924115570.0, "step": 460 }, { "epoch": 0.9236163285750063, "grad_norm": 0.23698534742461053, "learning_rate": 3.779822181221061e-05, "loss": 0.5679, "num_tokens": 1928295152.0, "step": 461 }, { "epoch": 0.9256198347107438, "grad_norm": 0.2656047732066002, "learning_rate": 3.778700978764842e-05, "loss": 0.5817, "num_tokens": 1932489456.0, "step": 462 }, { "epoch": 0.9276233408464813, "grad_norm": 0.20016429697608284, "learning_rate": 3.777577115774518e-05, "loss": 0.5713, "num_tokens": 1936683760.0, "step": 463 }, { "epoch": 0.9296268469822189, "grad_norm": 0.3292791999548982, "learning_rate": 3.7764505941440984e-05, "loss": 0.5865, "num_tokens": 1940878064.0, "step": 464 }, { "epoch": 0.9316303531179564, "grad_norm": 0.20432915076312116, "learning_rate": 3.775321415772073e-05, "loss": 0.5789, "num_tokens": 1945072368.0, "step": 465 }, { "epoch": 0.9336338592536939, "grad_norm": 0.32910363483201693, "learning_rate": 3.77418958256141e-05, "loss": 0.5864, "num_tokens": 1949266672.0, "step": 466 }, { "epoch": 0.9356373653894315, "grad_norm": 0.2536855421383259, "learning_rate": 3.7730550964195524e-05, "loss": 0.5691, "num_tokens": 1953460976.0, "step": 467 }, { "epoch": 0.9376408715251691, "grad_norm": 0.31347200026861466, "learning_rate": 3.7719179592584134e-05, "loss": 0.5946, "num_tokens": 1957655280.0, "step": 468 }, { "epoch": 0.9396443776609066, "grad_norm": 0.2864206810788256, "learning_rate": 3.770778172994373e-05, "loss": 0.5822, "num_tokens": 1961801485.0, "step": 469 }, { "epoch": 0.9416478837966441, "grad_norm": 0.27138780338900254, "learning_rate": 3.769635739548276e-05, "loss": 0.5789, "num_tokens": 1965995789.0, "step": 470 }, { "epoch": 0.9436513899323816, "grad_norm": 0.2504539053272057, "learning_rate": 3.7684906608454305e-05, "loss": 0.5933, "num_tokens": 1970167135.0, "step": 471 }, { "epoch": 0.9456548960681193, "grad_norm": 0.2571410163306318, "learning_rate": 3.7673429388155996e-05, "loss": 0.5977, "num_tokens": 1974361439.0, "step": 472 }, { "epoch": 0.9476584022038568, "grad_norm": 0.22647254228124117, "learning_rate": 3.766192575393002e-05, "loss": 0.5797, "num_tokens": 1978555743.0, "step": 473 }, { "epoch": 0.9496619083395943, "grad_norm": 0.23686706447746114, "learning_rate": 3.7650395725163095e-05, "loss": 0.5783, "num_tokens": 1982750047.0, "step": 474 }, { "epoch": 0.9516654144753318, "grad_norm": 0.20950008237570136, "learning_rate": 3.763883932128641e-05, "loss": 0.576, "num_tokens": 1986900656.0, "step": 475 }, { "epoch": 0.9536689206110693, "grad_norm": 0.2570614811450018, "learning_rate": 3.762725656177559e-05, "loss": 0.6022, "num_tokens": 1991089005.0, "step": 476 }, { "epoch": 0.9556724267468069, "grad_norm": 0.2957680248012442, "learning_rate": 3.76156474661507e-05, "loss": 0.5769, "num_tokens": 1995283309.0, "step": 477 }, { "epoch": 0.9576759328825445, "grad_norm": 0.20324918892314317, "learning_rate": 3.760401205397616e-05, "loss": 0.5867, "num_tokens": 1999477613.0, "step": 478 }, { "epoch": 0.959679439018282, "grad_norm": 0.18418469856429942, "learning_rate": 3.759235034486077e-05, "loss": 0.5891, "num_tokens": 2003671917.0, "step": 479 }, { "epoch": 0.9616829451540195, "grad_norm": 0.23011571180084012, "learning_rate": 3.7580662358457635e-05, "loss": 0.5888, "num_tokens": 2007811460.0, "step": 480 }, { "epoch": 0.963686451289757, "grad_norm": 0.24157779117674286, "learning_rate": 3.756894811446413e-05, "loss": 0.5781, "num_tokens": 2011998623.0, "step": 481 }, { "epoch": 0.9656899574254946, "grad_norm": 0.21531720230002013, "learning_rate": 3.755720763262191e-05, "loss": 0.5896, "num_tokens": 2016165383.0, "step": 482 }, { "epoch": 0.9676934635612322, "grad_norm": 0.32743573035090334, "learning_rate": 3.7545440932716826e-05, "loss": 0.5936, "num_tokens": 2020357366.0, "step": 483 }, { "epoch": 0.9696969696969697, "grad_norm": 0.22595391499455295, "learning_rate": 3.753364803457892e-05, "loss": 0.5926, "num_tokens": 2024551670.0, "step": 484 }, { "epoch": 0.9717004758327072, "grad_norm": 0.3637032602854358, "learning_rate": 3.752182895808239e-05, "loss": 0.5873, "num_tokens": 2028745974.0, "step": 485 }, { "epoch": 0.9737039819684448, "grad_norm": 0.28003476642814346, "learning_rate": 3.7509983723145534e-05, "loss": 0.5818, "num_tokens": 2032940278.0, "step": 486 }, { "epoch": 0.9757074881041823, "grad_norm": 0.35063288647099033, "learning_rate": 3.7498112349730755e-05, "loss": 0.5949, "num_tokens": 2037134582.0, "step": 487 }, { "epoch": 0.9777109942399198, "grad_norm": 0.2728046943428182, "learning_rate": 3.7486214857844495e-05, "loss": 0.5831, "num_tokens": 2041309186.0, "step": 488 }, { "epoch": 0.9797145003756574, "grad_norm": 0.3249458070253897, "learning_rate": 3.7474291267537217e-05, "loss": 0.5778, "num_tokens": 2045503490.0, "step": 489 }, { "epoch": 0.981718006511395, "grad_norm": 0.2525858882449858, "learning_rate": 3.746234159890337e-05, "loss": 0.5776, "num_tokens": 2049697794.0, "step": 490 }, { "epoch": 0.9837215126471325, "grad_norm": 0.3240679838800922, "learning_rate": 3.745036587208133e-05, "loss": 0.5869, "num_tokens": 2053864045.0, "step": 491 }, { "epoch": 0.98572501878287, "grad_norm": 0.244006094447954, "learning_rate": 3.743836410725343e-05, "loss": 0.5805, "num_tokens": 2058047180.0, "step": 492 }, { "epoch": 0.9877285249186075, "grad_norm": 0.27014013557517347, "learning_rate": 3.7426336324645844e-05, "loss": 0.5909, "num_tokens": 2062186967.0, "step": 493 }, { "epoch": 0.9897320310543452, "grad_norm": 0.2611075075921653, "learning_rate": 3.7414282544528604e-05, "loss": 0.5846, "num_tokens": 2066369719.0, "step": 494 }, { "epoch": 0.9917355371900827, "grad_norm": 0.22647843061081374, "learning_rate": 3.7402202787215567e-05, "loss": 0.5846, "num_tokens": 2070564023.0, "step": 495 }, { "epoch": 0.9937390433258202, "grad_norm": 0.22645126334621282, "learning_rate": 3.739009707306437e-05, "loss": 0.5702, "num_tokens": 2074735783.0, "step": 496 }, { "epoch": 0.9957425494615577, "grad_norm": 0.28307052714407566, "learning_rate": 3.7377965422476356e-05, "loss": 0.5724, "num_tokens": 2078926248.0, "step": 497 }, { "epoch": 0.9977460555972952, "grad_norm": 0.2175008732146815, "learning_rate": 3.736580785589663e-05, "loss": 0.5786, "num_tokens": 2083120552.0, "step": 498 }, { "epoch": 0.9997495617330328, "grad_norm": 0.3187055654693319, "learning_rate": 3.735362439381395e-05, "loss": 0.5785, "num_tokens": 2087299479.0, "step": 499 }, { "epoch": 1.0, "grad_norm": 0.3187055654693319, "learning_rate": 3.7341415056760697e-05, "loss": 0.589, "num_tokens": 2087823767.0, "step": 500 }, { "epoch": 1.0020035061357375, "grad_norm": 0.4650725478259329, "learning_rate": 3.7329179865312885e-05, "loss": 0.5673, "num_tokens": 2092018071.0, "step": 501 }, { "epoch": 1.004007012271475, "grad_norm": 0.26124397195672483, "learning_rate": 3.7316918840090094e-05, "loss": 0.5445, "num_tokens": 2096212375.0, "step": 502 }, { "epoch": 1.0060105184072126, "grad_norm": 0.3280146883302913, "learning_rate": 3.7304632001755435e-05, "loss": 0.5536, "num_tokens": 2100388008.0, "step": 503 }, { "epoch": 1.00801402454295, "grad_norm": 0.2665342746166771, "learning_rate": 3.72923193710155e-05, "loss": 0.5434, "num_tokens": 2104566148.0, "step": 504 }, { "epoch": 1.0100175306786876, "grad_norm": 0.35632991307670947, "learning_rate": 3.727998096862041e-05, "loss": 0.5686, "num_tokens": 2108760452.0, "step": 505 }, { "epoch": 1.0120210368144253, "grad_norm": 0.28086087616230815, "learning_rate": 3.726761681536365e-05, "loss": 0.5676, "num_tokens": 2112952750.0, "step": 506 }, { "epoch": 1.0140245429501629, "grad_norm": 0.3390152661621213, "learning_rate": 3.725522693208214e-05, "loss": 0.5588, "num_tokens": 2117147054.0, "step": 507 }, { "epoch": 1.0160280490859004, "grad_norm": 0.26047426934751533, "learning_rate": 3.724281133965615e-05, "loss": 0.5428, "num_tokens": 2121341358.0, "step": 508 }, { "epoch": 1.018031555221638, "grad_norm": 0.30905121443781214, "learning_rate": 3.723037005900929e-05, "loss": 0.5683, "num_tokens": 2125507492.0, "step": 509 }, { "epoch": 1.0200350613573754, "grad_norm": 0.2500072495447195, "learning_rate": 3.721790311110843e-05, "loss": 0.571, "num_tokens": 2129697870.0, "step": 510 }, { "epoch": 1.022038567493113, "grad_norm": 0.3856243852707768, "learning_rate": 3.720541051696373e-05, "loss": 0.5713, "num_tokens": 2133874463.0, "step": 511 }, { "epoch": 1.0240420736288505, "grad_norm": 0.30197149190566486, "learning_rate": 3.719289229762856e-05, "loss": 0.5615, "num_tokens": 2138047367.0, "step": 512 }, { "epoch": 1.026045579764588, "grad_norm": 0.31850188849126243, "learning_rate": 3.7180348474199474e-05, "loss": 0.5524, "num_tokens": 2142241671.0, "step": 513 }, { "epoch": 1.0280490859003255, "grad_norm": 0.2534121476821553, "learning_rate": 3.716777906781617e-05, "loss": 0.5551, "num_tokens": 2146435975.0, "step": 514 }, { "epoch": 1.030052592036063, "grad_norm": 0.35547856493117164, "learning_rate": 3.715518409966147e-05, "loss": 0.572, "num_tokens": 2150619518.0, "step": 515 }, { "epoch": 1.0320560981718006, "grad_norm": 0.29200230999251614, "learning_rate": 3.7142563590961265e-05, "loss": 0.5578, "num_tokens": 2154813822.0, "step": 516 }, { "epoch": 1.0340596043075383, "grad_norm": 0.2543001869701751, "learning_rate": 3.712991756298449e-05, "loss": 0.5486, "num_tokens": 2159008126.0, "step": 517 }, { "epoch": 1.0360631104432758, "grad_norm": 0.27299360487073504, "learning_rate": 3.71172460370431e-05, "loss": 0.5562, "num_tokens": 2163202430.0, "step": 518 }, { "epoch": 1.0380666165790133, "grad_norm": 0.20811477648344093, "learning_rate": 3.710454903449201e-05, "loss": 0.5643, "num_tokens": 2167376428.0, "step": 519 }, { "epoch": 1.0400701227147509, "grad_norm": 0.2570778827972109, "learning_rate": 3.709182657672907e-05, "loss": 0.5529, "num_tokens": 2171570732.0, "step": 520 }, { "epoch": 1.0420736288504884, "grad_norm": 0.21352908993345981, "learning_rate": 3.7079078685195026e-05, "loss": 0.5581, "num_tokens": 2175765036.0, "step": 521 }, { "epoch": 1.044077134986226, "grad_norm": 0.285546103607232, "learning_rate": 3.706630538137348e-05, "loss": 0.5624, "num_tokens": 2179959340.0, "step": 522 }, { "epoch": 1.0460806411219634, "grad_norm": 0.18888002382466376, "learning_rate": 3.7053506686790905e-05, "loss": 0.5459, "num_tokens": 2184153644.0, "step": 523 }, { "epoch": 1.048084147257701, "grad_norm": 0.2629456056451125, "learning_rate": 3.7040682623016494e-05, "loss": 0.5677, "num_tokens": 2188324523.0, "step": 524 }, { "epoch": 1.0500876533934385, "grad_norm": 0.25708386140071166, "learning_rate": 3.7027833211662246e-05, "loss": 0.5584, "num_tokens": 2192518827.0, "step": 525 }, { "epoch": 1.052091159529176, "grad_norm": 0.22378039816551756, "learning_rate": 3.7014958474382864e-05, "loss": 0.5623, "num_tokens": 2196706436.0, "step": 526 }, { "epoch": 1.0540946656649135, "grad_norm": 0.2819435964020685, "learning_rate": 3.700205843287572e-05, "loss": 0.5733, "num_tokens": 2200864112.0, "step": 527 }, { "epoch": 1.0560981718006512, "grad_norm": 0.26748689234177603, "learning_rate": 3.698913310888085e-05, "loss": 0.5511, "num_tokens": 2205038839.0, "step": 528 }, { "epoch": 1.0581016779363888, "grad_norm": 0.2353352374280479, "learning_rate": 3.697618252418088e-05, "loss": 0.5424, "num_tokens": 2209232277.0, "step": 529 }, { "epoch": 1.0601051840721263, "grad_norm": 0.2935590544400099, "learning_rate": 3.696320670060103e-05, "loss": 0.5662, "num_tokens": 2213420732.0, "step": 530 }, { "epoch": 1.0621086902078638, "grad_norm": 0.20466877327907884, "learning_rate": 3.695020566000902e-05, "loss": 0.5673, "num_tokens": 2217615036.0, "step": 531 }, { "epoch": 1.0641121963436013, "grad_norm": 0.4014358366260843, "learning_rate": 3.6937179424315095e-05, "loss": 0.5705, "num_tokens": 2221809340.0, "step": 532 }, { "epoch": 1.0661157024793388, "grad_norm": 0.36933547297745645, "learning_rate": 3.6924128015471956e-05, "loss": 0.5608, "num_tokens": 2226000960.0, "step": 533 }, { "epoch": 1.0681192086150764, "grad_norm": 0.2938752570288982, "learning_rate": 3.6911051455474714e-05, "loss": 0.5543, "num_tokens": 2230184118.0, "step": 534 }, { "epoch": 1.0701227147508139, "grad_norm": 0.31833544606848524, "learning_rate": 3.6897949766360884e-05, "loss": 0.5711, "num_tokens": 2234378422.0, "step": 535 }, { "epoch": 1.0721262208865514, "grad_norm": 0.24138932254516426, "learning_rate": 3.688482297021032e-05, "loss": 0.5477, "num_tokens": 2238572726.0, "step": 536 }, { "epoch": 1.074129727022289, "grad_norm": 0.23727890638168075, "learning_rate": 3.687167108914519e-05, "loss": 0.5483, "num_tokens": 2242743690.0, "step": 537 }, { "epoch": 1.0761332331580264, "grad_norm": 0.2870858488077604, "learning_rate": 3.685849414532993e-05, "loss": 0.56, "num_tokens": 2246937994.0, "step": 538 }, { "epoch": 1.0781367392937642, "grad_norm": 0.16474873130165163, "learning_rate": 3.6845292160971235e-05, "loss": 0.5534, "num_tokens": 2251132298.0, "step": 539 }, { "epoch": 1.0801402454295017, "grad_norm": 0.3174458461337024, "learning_rate": 3.683206515831798e-05, "loss": 0.5665, "num_tokens": 2255326602.0, "step": 540 }, { "epoch": 1.0821437515652392, "grad_norm": 0.21870041023873518, "learning_rate": 3.681881315966121e-05, "loss": 0.5609, "num_tokens": 2259499742.0, "step": 541 }, { "epoch": 1.0841472577009768, "grad_norm": 0.30490353818951693, "learning_rate": 3.680553618733408e-05, "loss": 0.5729, "num_tokens": 2263660645.0, "step": 542 }, { "epoch": 1.0861507638367143, "grad_norm": 0.2422886773088014, "learning_rate": 3.679223426371186e-05, "loss": 0.5562, "num_tokens": 2267830260.0, "step": 543 }, { "epoch": 1.0881542699724518, "grad_norm": 0.2757422950079848, "learning_rate": 3.677890741121185e-05, "loss": 0.5441, "num_tokens": 2272024564.0, "step": 544 }, { "epoch": 1.0901577761081893, "grad_norm": 0.2385073279402522, "learning_rate": 3.676555565229336e-05, "loss": 0.5573, "num_tokens": 2276209980.0, "step": 545 }, { "epoch": 1.0921612822439268, "grad_norm": 0.3148207236490165, "learning_rate": 3.675217900945769e-05, "loss": 0.5704, "num_tokens": 2280401879.0, "step": 546 }, { "epoch": 1.0941647883796644, "grad_norm": 0.22407437742538722, "learning_rate": 3.673877750524807e-05, "loss": 0.5539, "num_tokens": 2284571471.0, "step": 547 }, { "epoch": 1.0961682945154019, "grad_norm": 0.34324537938473154, "learning_rate": 3.672535116224963e-05, "loss": 0.5612, "num_tokens": 2288751196.0, "step": 548 }, { "epoch": 1.0981718006511394, "grad_norm": 0.2623862759259999, "learning_rate": 3.671190000308932e-05, "loss": 0.5585, "num_tokens": 2292933742.0, "step": 549 }, { "epoch": 1.1001753067868771, "grad_norm": 0.346501926221328, "learning_rate": 3.6698424050435994e-05, "loss": 0.5672, "num_tokens": 2297100236.0, "step": 550 }, { "epoch": 1.1021788129226147, "grad_norm": 0.2646451995911233, "learning_rate": 3.6684923327000215e-05, "loss": 0.5574, "num_tokens": 2301288451.0, "step": 551 }, { "epoch": 1.1041823190583522, "grad_norm": 0.3017159306119006, "learning_rate": 3.667139785553432e-05, "loss": 0.5739, "num_tokens": 2305478283.0, "step": 552 }, { "epoch": 1.1061858251940897, "grad_norm": 0.28387334101442246, "learning_rate": 3.665784765883236e-05, "loss": 0.5549, "num_tokens": 2309672587.0, "step": 553 }, { "epoch": 1.1081893313298272, "grad_norm": 0.23396028953108422, "learning_rate": 3.664427275973005e-05, "loss": 0.5574, "num_tokens": 2313863861.0, "step": 554 }, { "epoch": 1.1101928374655647, "grad_norm": 0.28247959580993476, "learning_rate": 3.663067318110471e-05, "loss": 0.5613, "num_tokens": 2318055008.0, "step": 555 }, { "epoch": 1.1121963436013023, "grad_norm": 0.1921984735177737, "learning_rate": 3.6617048945875305e-05, "loss": 0.5598, "num_tokens": 2322249312.0, "step": 556 }, { "epoch": 1.1141998497370398, "grad_norm": 0.25086300823447066, "learning_rate": 3.66034000770023e-05, "loss": 0.5704, "num_tokens": 2326443616.0, "step": 557 }, { "epoch": 1.1162033558727773, "grad_norm": 0.21362780467903317, "learning_rate": 3.6589726597487694e-05, "loss": 0.56, "num_tokens": 2330608463.0, "step": 558 }, { "epoch": 1.1182068620085148, "grad_norm": 0.19831705154357013, "learning_rate": 3.657602853037498e-05, "loss": 0.5544, "num_tokens": 2334784290.0, "step": 559 }, { "epoch": 1.1202103681442523, "grad_norm": 0.25136447821918223, "learning_rate": 3.656230589874905e-05, "loss": 0.5438, "num_tokens": 2338972312.0, "step": 560 }, { "epoch": 1.12221387427999, "grad_norm": 0.18785447168523198, "learning_rate": 3.654855872573624e-05, "loss": 0.5528, "num_tokens": 2343149034.0, "step": 561 }, { "epoch": 1.1242173804157276, "grad_norm": 0.20030206882578802, "learning_rate": 3.6534787034504205e-05, "loss": 0.5515, "num_tokens": 2347340476.0, "step": 562 }, { "epoch": 1.1262208865514651, "grad_norm": 0.17425211022823603, "learning_rate": 3.652099084826193e-05, "loss": 0.557, "num_tokens": 2351514614.0, "step": 563 }, { "epoch": 1.1282243926872026, "grad_norm": 0.20421288927229664, "learning_rate": 3.6507170190259686e-05, "loss": 0.5594, "num_tokens": 2355708918.0, "step": 564 }, { "epoch": 1.1302278988229402, "grad_norm": 0.20072755230843967, "learning_rate": 3.6493325083788994e-05, "loss": 0.5583, "num_tokens": 2359903222.0, "step": 565 }, { "epoch": 1.1322314049586777, "grad_norm": 0.20947932367405453, "learning_rate": 3.6479455552182566e-05, "loss": 0.5698, "num_tokens": 2364087862.0, "step": 566 }, { "epoch": 1.1342349110944152, "grad_norm": 0.23946802962721112, "learning_rate": 3.646556161881427e-05, "loss": 0.5546, "num_tokens": 2368279132.0, "step": 567 }, { "epoch": 1.1362384172301527, "grad_norm": 0.24479840860388014, "learning_rate": 3.645164330709911e-05, "loss": 0.5316, "num_tokens": 2372449230.0, "step": 568 }, { "epoch": 1.1382419233658903, "grad_norm": 0.22433943867805906, "learning_rate": 3.643770064049318e-05, "loss": 0.5478, "num_tokens": 2376643534.0, "step": 569 }, { "epoch": 1.1402454295016278, "grad_norm": 0.17454395275703066, "learning_rate": 3.64237336424936e-05, "loss": 0.5643, "num_tokens": 2380837838.0, "step": 570 }, { "epoch": 1.1422489356373653, "grad_norm": 0.1962142342056278, "learning_rate": 3.640974233663849e-05, "loss": 0.5524, "num_tokens": 2385032142.0, "step": 571 }, { "epoch": 1.1442524417731028, "grad_norm": 0.18877231634658428, "learning_rate": 3.6395726746506974e-05, "loss": 0.5686, "num_tokens": 2389226446.0, "step": 572 }, { "epoch": 1.1462559479088406, "grad_norm": 0.2731164849326947, "learning_rate": 3.638168689571906e-05, "loss": 0.5407, "num_tokens": 2393400587.0, "step": 573 }, { "epoch": 1.148259454044578, "grad_norm": 0.1977504217768025, "learning_rate": 3.636762280793566e-05, "loss": 0.5417, "num_tokens": 2397562501.0, "step": 574 }, { "epoch": 1.1502629601803156, "grad_norm": 0.20376337610228354, "learning_rate": 3.635353450685854e-05, "loss": 0.5576, "num_tokens": 2401756805.0, "step": 575 }, { "epoch": 1.1522664663160531, "grad_norm": 0.2581762084079028, "learning_rate": 3.633942201623025e-05, "loss": 0.5652, "num_tokens": 2405951109.0, "step": 576 }, { "epoch": 1.1542699724517906, "grad_norm": 0.2791787852103277, "learning_rate": 3.6325285359834126e-05, "loss": 0.5549, "num_tokens": 2410145413.0, "step": 577 }, { "epoch": 1.1562734785875282, "grad_norm": 0.20912795783400145, "learning_rate": 3.6311124561494215e-05, "loss": 0.5615, "num_tokens": 2414339717.0, "step": 578 }, { "epoch": 1.1582769847232657, "grad_norm": 0.28362660551925134, "learning_rate": 3.629693964507526e-05, "loss": 0.5523, "num_tokens": 2418480265.0, "step": 579 }, { "epoch": 1.1602804908590032, "grad_norm": 0.27957055011489507, "learning_rate": 3.6282730634482646e-05, "loss": 0.5565, "num_tokens": 2422674569.0, "step": 580 }, { "epoch": 1.1622839969947407, "grad_norm": 0.1977385247152695, "learning_rate": 3.626849755366236e-05, "loss": 0.5498, "num_tokens": 2426853496.0, "step": 581 }, { "epoch": 1.1642875031304782, "grad_norm": 0.3533290439409778, "learning_rate": 3.625424042660095e-05, "loss": 0.556, "num_tokens": 2431047800.0, "step": 582 }, { "epoch": 1.166291009266216, "grad_norm": 0.23030697177846152, "learning_rate": 3.623995927732551e-05, "loss": 0.548, "num_tokens": 2435242104.0, "step": 583 }, { "epoch": 1.1682945154019535, "grad_norm": 0.43544233740944266, "learning_rate": 3.62256541299036e-05, "loss": 0.5713, "num_tokens": 2439423854.0, "step": 584 }, { "epoch": 1.170298021537691, "grad_norm": 0.3458220272618275, "learning_rate": 3.621132500844321e-05, "loss": 0.5814, "num_tokens": 2443599759.0, "step": 585 }, { "epoch": 1.1723015276734285, "grad_norm": 0.44155428231888066, "learning_rate": 3.619697193709277e-05, "loss": 0.5702, "num_tokens": 2447794063.0, "step": 586 }, { "epoch": 1.174305033809166, "grad_norm": 0.3544958459853805, "learning_rate": 3.6182594940041026e-05, "loss": 0.556, "num_tokens": 2451975292.0, "step": 587 }, { "epoch": 1.1763085399449036, "grad_norm": 0.4606628693782234, "learning_rate": 3.616819404151709e-05, "loss": 0.5655, "num_tokens": 2456114776.0, "step": 588 }, { "epoch": 1.178312046080641, "grad_norm": 0.4511796354328812, "learning_rate": 3.6153769265790334e-05, "loss": 0.5576, "num_tokens": 2460300224.0, "step": 589 }, { "epoch": 1.1803155522163786, "grad_norm": 0.28678664185086555, "learning_rate": 3.613932063717036e-05, "loss": 0.5473, "num_tokens": 2464482820.0, "step": 590 }, { "epoch": 1.1823190583521161, "grad_norm": 0.3403509020169062, "learning_rate": 3.612484818000698e-05, "loss": 0.5457, "num_tokens": 2468677124.0, "step": 591 }, { "epoch": 1.1843225644878537, "grad_norm": 0.2893225453113179, "learning_rate": 3.6110351918690164e-05, "loss": 0.5466, "num_tokens": 2472871428.0, "step": 592 }, { "epoch": 1.1863260706235912, "grad_norm": 0.2598830335217951, "learning_rate": 3.609583187765e-05, "loss": 0.5607, "num_tokens": 2477065732.0, "step": 593 }, { "epoch": 1.1883295767593287, "grad_norm": 0.30922351784458885, "learning_rate": 3.608128808135664e-05, "loss": 0.5386, "num_tokens": 2481211074.0, "step": 594 }, { "epoch": 1.1903330828950665, "grad_norm": 0.22319618744027403, "learning_rate": 3.606672055432028e-05, "loss": 0.5421, "num_tokens": 2485378136.0, "step": 595 }, { "epoch": 1.192336589030804, "grad_norm": 0.32807341502404547, "learning_rate": 3.605212932109111e-05, "loss": 0.5499, "num_tokens": 2489572440.0, "step": 596 }, { "epoch": 1.1943400951665415, "grad_norm": 0.22336677965204674, "learning_rate": 3.6037514406259255e-05, "loss": 0.5483, "num_tokens": 2493766744.0, "step": 597 }, { "epoch": 1.196343601302279, "grad_norm": 0.3535913795252647, "learning_rate": 3.602287583445478e-05, "loss": 0.5556, "num_tokens": 2497961048.0, "step": 598 }, { "epoch": 1.1983471074380165, "grad_norm": 0.2737317677307051, "learning_rate": 3.600821363034758e-05, "loss": 0.56, "num_tokens": 2502096172.0, "step": 599 }, { "epoch": 1.200350613573754, "grad_norm": 0.33340525124477877, "learning_rate": 3.599352781864742e-05, "loss": 0.5542, "num_tokens": 2506290476.0, "step": 600 }, { "epoch": 1.2023541197094916, "grad_norm": 0.32456781833679355, "learning_rate": 3.597881842410381e-05, "loss": 0.5578, "num_tokens": 2510484780.0, "step": 601 }, { "epoch": 1.204357625845229, "grad_norm": 0.32173822462223683, "learning_rate": 3.596408547150603e-05, "loss": 0.5566, "num_tokens": 2514634079.0, "step": 602 }, { "epoch": 1.2063611319809666, "grad_norm": 0.2909275269786899, "learning_rate": 3.594932898568304e-05, "loss": 0.5578, "num_tokens": 2518801897.0, "step": 603 }, { "epoch": 1.2083646381167041, "grad_norm": 0.2996016060779246, "learning_rate": 3.593454899150349e-05, "loss": 0.5513, "num_tokens": 2522996201.0, "step": 604 }, { "epoch": 1.2103681442524419, "grad_norm": 0.27287961638594854, "learning_rate": 3.5919745513875615e-05, "loss": 0.5591, "num_tokens": 2527133482.0, "step": 605 }, { "epoch": 1.2123716503881794, "grad_norm": 0.2651921931845752, "learning_rate": 3.5904918577747246e-05, "loss": 0.5449, "num_tokens": 2531327786.0, "step": 606 }, { "epoch": 1.214375156523917, "grad_norm": 0.28777772283291925, "learning_rate": 3.589006820810574e-05, "loss": 0.5516, "num_tokens": 2535491932.0, "step": 607 }, { "epoch": 1.2163786626596544, "grad_norm": 0.24422774169085082, "learning_rate": 3.5875194429977955e-05, "loss": 0.5454, "num_tokens": 2539674446.0, "step": 608 }, { "epoch": 1.218382168795392, "grad_norm": 0.2487876333982406, "learning_rate": 3.586029726843019e-05, "loss": 0.5423, "num_tokens": 2543819178.0, "step": 609 }, { "epoch": 1.2203856749311295, "grad_norm": 0.2131085555942081, "learning_rate": 3.5845376748568154e-05, "loss": 0.5457, "num_tokens": 2547979297.0, "step": 610 }, { "epoch": 1.222389181066867, "grad_norm": 0.26104546841935616, "learning_rate": 3.583043289553691e-05, "loss": 0.5606, "num_tokens": 2552173601.0, "step": 611 }, { "epoch": 1.2243926872026045, "grad_norm": 0.26294226097421697, "learning_rate": 3.5815465734520863e-05, "loss": 0.5471, "num_tokens": 2556331563.0, "step": 612 }, { "epoch": 1.226396193338342, "grad_norm": 0.24693973953316511, "learning_rate": 3.580047529074369e-05, "loss": 0.5627, "num_tokens": 2560509426.0, "step": 613 }, { "epoch": 1.2283996994740796, "grad_norm": 0.269442151857774, "learning_rate": 3.578546158946831e-05, "loss": 0.5554, "num_tokens": 2564703730.0, "step": 614 }, { "epoch": 1.230403205609817, "grad_norm": 0.20112836768343784, "learning_rate": 3.577042465599682e-05, "loss": 0.5569, "num_tokens": 2568869171.0, "step": 615 }, { "epoch": 1.2324067117455546, "grad_norm": 0.24309767543355018, "learning_rate": 3.575536451567049e-05, "loss": 0.5586, "num_tokens": 2573063475.0, "step": 616 }, { "epoch": 1.2344102178812923, "grad_norm": 0.20024087874337437, "learning_rate": 3.574028119386968e-05, "loss": 0.5612, "num_tokens": 2577230162.0, "step": 617 }, { "epoch": 1.2364137240170299, "grad_norm": 0.251286757050521, "learning_rate": 3.572517471601385e-05, "loss": 0.5523, "num_tokens": 2581424466.0, "step": 618 }, { "epoch": 1.2384172301527674, "grad_norm": 0.18046873691822213, "learning_rate": 3.571004510756144e-05, "loss": 0.5612, "num_tokens": 2585608771.0, "step": 619 }, { "epoch": 1.240420736288505, "grad_norm": 0.1981977448933834, "learning_rate": 3.56948923940099e-05, "loss": 0.5626, "num_tokens": 2589803075.0, "step": 620 }, { "epoch": 1.2424242424242424, "grad_norm": 0.1733579565455049, "learning_rate": 3.567971660089562e-05, "loss": 0.5561, "num_tokens": 2593983100.0, "step": 621 }, { "epoch": 1.24442774855998, "grad_norm": 0.19174261108690743, "learning_rate": 3.566451775379387e-05, "loss": 0.5402, "num_tokens": 2598177404.0, "step": 622 }, { "epoch": 1.2464312546957175, "grad_norm": 0.1648610321757225, "learning_rate": 3.564929587831878e-05, "loss": 0.5451, "num_tokens": 2602332936.0, "step": 623 }, { "epoch": 1.248434760831455, "grad_norm": 0.18990023513322232, "learning_rate": 3.563405100012329e-05, "loss": 0.5432, "num_tokens": 2606520941.0, "step": 624 }, { "epoch": 1.2504382669671925, "grad_norm": 0.22251340829066305, "learning_rate": 3.5618783144899106e-05, "loss": 0.55, "num_tokens": 2610715245.0, "step": 625 }, { "epoch": 1.2524417731029303, "grad_norm": 0.149240916403334, "learning_rate": 3.5603492338376656e-05, "loss": 0.5645, "num_tokens": 2614884936.0, "step": 626 }, { "epoch": 1.2544452792386678, "grad_norm": 0.1663909526708326, "learning_rate": 3.5588178606325034e-05, "loss": 0.5547, "num_tokens": 2619064906.0, "step": 627 }, { "epoch": 1.2564487853744053, "grad_norm": 0.19446009118940932, "learning_rate": 3.557284197455199e-05, "loss": 0.5639, "num_tokens": 2623216699.0, "step": 628 }, { "epoch": 1.2584522915101428, "grad_norm": 0.1693440483108325, "learning_rate": 3.5557482468903864e-05, "loss": 0.5602, "num_tokens": 2627334730.0, "step": 629 }, { "epoch": 1.2604557976458803, "grad_norm": 0.15440873806197214, "learning_rate": 3.554210011526552e-05, "loss": 0.5508, "num_tokens": 2631513594.0, "step": 630 }, { "epoch": 1.2624593037816179, "grad_norm": 0.17136107339026518, "learning_rate": 3.5526694939560357e-05, "loss": 0.5502, "num_tokens": 2635707898.0, "step": 631 }, { "epoch": 1.2644628099173554, "grad_norm": 0.21512384865603376, "learning_rate": 3.5511266967750227e-05, "loss": 0.5558, "num_tokens": 2639898486.0, "step": 632 }, { "epoch": 1.266466316053093, "grad_norm": 0.24907037685945513, "learning_rate": 3.5495816225835386e-05, "loss": 0.5568, "num_tokens": 2644069299.0, "step": 633 }, { "epoch": 1.2684698221888304, "grad_norm": 0.14417282300810194, "learning_rate": 3.548034273985448e-05, "loss": 0.5606, "num_tokens": 2648263603.0, "step": 634 }, { "epoch": 1.270473328324568, "grad_norm": 0.23087611574821706, "learning_rate": 3.546484653588448e-05, "loss": 0.5616, "num_tokens": 2652457907.0, "step": 635 }, { "epoch": 1.2724768344603055, "grad_norm": 0.15669213568626317, "learning_rate": 3.544932764004064e-05, "loss": 0.5513, "num_tokens": 2656652211.0, "step": 636 }, { "epoch": 1.274480340596043, "grad_norm": 0.19641575401956776, "learning_rate": 3.543378607847645e-05, "loss": 0.5503, "num_tokens": 2660846515.0, "step": 637 }, { "epoch": 1.2764838467317805, "grad_norm": 0.17669605663294016, "learning_rate": 3.541822187738361e-05, "loss": 0.5444, "num_tokens": 2665040819.0, "step": 638 }, { "epoch": 1.278487352867518, "grad_norm": 0.20221038451337311, "learning_rate": 3.540263506299197e-05, "loss": 0.5485, "num_tokens": 2669232802.0, "step": 639 }, { "epoch": 1.2804908590032558, "grad_norm": 0.19919590572907384, "learning_rate": 3.5387025661569486e-05, "loss": 0.5549, "num_tokens": 2673405488.0, "step": 640 }, { "epoch": 1.2824943651389933, "grad_norm": 0.2779045329935449, "learning_rate": 3.537139369942219e-05, "loss": 0.5616, "num_tokens": 2677599792.0, "step": 641 }, { "epoch": 1.2844978712747308, "grad_norm": 0.1810300990498648, "learning_rate": 3.5355739202894116e-05, "loss": 0.5509, "num_tokens": 2681794096.0, "step": 642 }, { "epoch": 1.2865013774104683, "grad_norm": 0.2512806886250312, "learning_rate": 3.534006219836728e-05, "loss": 0.5553, "num_tokens": 2685959095.0, "step": 643 }, { "epoch": 1.2885048835462058, "grad_norm": 0.3107393745612657, "learning_rate": 3.5324362712261656e-05, "loss": 0.5425, "num_tokens": 2690153399.0, "step": 644 }, { "epoch": 1.2905083896819434, "grad_norm": 0.21212034061853724, "learning_rate": 3.5308640771035056e-05, "loss": 0.5437, "num_tokens": 2694347703.0, "step": 645 }, { "epoch": 1.292511895817681, "grad_norm": 0.37873084220223424, "learning_rate": 3.5292896401183194e-05, "loss": 0.5511, "num_tokens": 2698542007.0, "step": 646 }, { "epoch": 1.2945154019534184, "grad_norm": 0.2981558857869348, "learning_rate": 3.527712962923952e-05, "loss": 0.5406, "num_tokens": 2702736311.0, "step": 647 }, { "epoch": 1.2965189080891562, "grad_norm": 0.30148128093548565, "learning_rate": 3.526134048177529e-05, "loss": 0.553, "num_tokens": 2706930615.0, "step": 648 }, { "epoch": 1.2985224142248937, "grad_norm": 0.24370478400465007, "learning_rate": 3.524552898539944e-05, "loss": 0.5437, "num_tokens": 2711101551.0, "step": 649 }, { "epoch": 1.3005259203606312, "grad_norm": 0.28186794957000033, "learning_rate": 3.522969516675858e-05, "loss": 0.5426, "num_tokens": 2715275027.0, "step": 650 }, { "epoch": 1.3025294264963687, "grad_norm": 0.2079125646465861, "learning_rate": 3.521383905253694e-05, "loss": 0.5455, "num_tokens": 2719440817.0, "step": 651 }, { "epoch": 1.3045329326321062, "grad_norm": 0.30629674338366036, "learning_rate": 3.519796066945631e-05, "loss": 0.549, "num_tokens": 2723635121.0, "step": 652 }, { "epoch": 1.3065364387678438, "grad_norm": 0.18650763637785825, "learning_rate": 3.5182060044276025e-05, "loss": 0.552, "num_tokens": 2727816246.0, "step": 653 }, { "epoch": 1.3085399449035813, "grad_norm": 0.3671676675475379, "learning_rate": 3.5166137203792905e-05, "loss": 0.5459, "num_tokens": 2732010550.0, "step": 654 }, { "epoch": 1.3105434510393188, "grad_norm": 0.2682136496739711, "learning_rate": 3.515019217484119e-05, "loss": 0.56, "num_tokens": 2736204854.0, "step": 655 }, { "epoch": 1.3125469571750563, "grad_norm": 0.34128291717922105, "learning_rate": 3.513422498429254e-05, "loss": 0.56, "num_tokens": 2740399158.0, "step": 656 }, { "epoch": 1.3145504633107938, "grad_norm": 0.32239273307936706, "learning_rate": 3.511823565905594e-05, "loss": 0.5556, "num_tokens": 2744573322.0, "step": 657 }, { "epoch": 1.3165539694465314, "grad_norm": 0.30175922444683756, "learning_rate": 3.5102224226077684e-05, "loss": 0.5484, "num_tokens": 2748767626.0, "step": 658 }, { "epoch": 1.3185574755822689, "grad_norm": 1.3736705179286772, "learning_rate": 3.508619071234133e-05, "loss": 0.5548, "num_tokens": 2752961930.0, "step": 659 }, { "epoch": 1.3205609817180064, "grad_norm": 0.4855415589059751, "learning_rate": 3.507013514486764e-05, "loss": 0.5516, "num_tokens": 2757130135.0, "step": 660 }, { "epoch": 1.322564487853744, "grad_norm": 0.21667512603921624, "learning_rate": 3.505405755071456e-05, "loss": 0.5533, "num_tokens": 2761312445.0, "step": 661 }, { "epoch": 1.3245679939894817, "grad_norm": 0.4931560357470961, "learning_rate": 3.5037957956977134e-05, "loss": 0.548, "num_tokens": 2765475760.0, "step": 662 }, { "epoch": 1.3265715001252192, "grad_norm": 0.5176093204155237, "learning_rate": 3.502183639078748e-05, "loss": 0.5724, "num_tokens": 2769666085.0, "step": 663 }, { "epoch": 1.3285750062609567, "grad_norm": 0.2402963759176084, "learning_rate": 3.500569287931478e-05, "loss": 0.5627, "num_tokens": 2773860389.0, "step": 664 }, { "epoch": 1.3305785123966942, "grad_norm": 0.3471986234908786, "learning_rate": 3.498952744976515e-05, "loss": 0.5543, "num_tokens": 2778005249.0, "step": 665 }, { "epoch": 1.3325820185324317, "grad_norm": 0.35097776278778764, "learning_rate": 3.4973340129381696e-05, "loss": 0.5534, "num_tokens": 2782181421.0, "step": 666 }, { "epoch": 1.3345855246681693, "grad_norm": 0.2357442604803629, "learning_rate": 3.495713094544438e-05, "loss": 0.5581, "num_tokens": 2786347085.0, "step": 667 }, { "epoch": 1.3365890308039068, "grad_norm": 0.3322813100260744, "learning_rate": 3.4940899925270006e-05, "loss": 0.5506, "num_tokens": 2790510191.0, "step": 668 }, { "epoch": 1.3385925369396443, "grad_norm": 0.2611435141196737, "learning_rate": 3.4924647096212205e-05, "loss": 0.5515, "num_tokens": 2794704495.0, "step": 669 }, { "epoch": 1.340596043075382, "grad_norm": 0.299396141118144, "learning_rate": 3.490837248566135e-05, "loss": 0.5576, "num_tokens": 2798873159.0, "step": 670 }, { "epoch": 1.3425995492111196, "grad_norm": 0.2953289806127257, "learning_rate": 3.489207612104451e-05, "loss": 0.5616, "num_tokens": 2803056241.0, "step": 671 }, { "epoch": 1.344603055346857, "grad_norm": 0.2703229975828057, "learning_rate": 3.4875758029825434e-05, "loss": 0.5592, "num_tokens": 2807207028.0, "step": 672 }, { "epoch": 1.3466065614825946, "grad_norm": 0.33131474555690155, "learning_rate": 3.4859418239504464e-05, "loss": 0.5551, "num_tokens": 2811391065.0, "step": 673 }, { "epoch": 1.3486100676183321, "grad_norm": 0.34082577393914465, "learning_rate": 3.484305677761854e-05, "loss": 0.5506, "num_tokens": 2815585369.0, "step": 674 }, { "epoch": 1.3506135737540697, "grad_norm": 0.2556198341250632, "learning_rate": 3.4826673671741115e-05, "loss": 0.5465, "num_tokens": 2819766155.0, "step": 675 }, { "epoch": 1.3526170798898072, "grad_norm": 0.32582201486196216, "learning_rate": 3.4810268949482084e-05, "loss": 0.5344, "num_tokens": 2823960459.0, "step": 676 }, { "epoch": 1.3546205860255447, "grad_norm": 0.21539386831861423, "learning_rate": 3.4793842638487825e-05, "loss": 0.5645, "num_tokens": 2828154763.0, "step": 677 }, { "epoch": 1.3566240921612822, "grad_norm": 0.30655967026099445, "learning_rate": 3.477739476644106e-05, "loss": 0.543, "num_tokens": 2832343715.0, "step": 678 }, { "epoch": 1.3586275982970197, "grad_norm": 0.2183471704074474, "learning_rate": 3.476092536106086e-05, "loss": 0.5547, "num_tokens": 2836517480.0, "step": 679 }, { "epoch": 1.3606311044327573, "grad_norm": 0.3045082626652177, "learning_rate": 3.4744434450102593e-05, "loss": 0.5752, "num_tokens": 2840693209.0, "step": 680 }, { "epoch": 1.3626346105684948, "grad_norm": 0.21182324132948557, "learning_rate": 3.472792206135786e-05, "loss": 0.5464, "num_tokens": 2844881397.0, "step": 681 }, { "epoch": 1.3646381167042323, "grad_norm": 0.30616433861566467, "learning_rate": 3.471138822265445e-05, "loss": 0.5388, "num_tokens": 2849044368.0, "step": 682 }, { "epoch": 1.3666416228399698, "grad_norm": 0.2455342053120656, "learning_rate": 3.4694832961856326e-05, "loss": 0.5516, "num_tokens": 2853235006.0, "step": 683 }, { "epoch": 1.3686451289757076, "grad_norm": 0.27267209904488293, "learning_rate": 3.4678256306863526e-05, "loss": 0.5616, "num_tokens": 2857429310.0, "step": 684 }, { "epoch": 1.370648635111445, "grad_norm": 0.23191268141375257, "learning_rate": 3.4661658285612157e-05, "loss": 0.547, "num_tokens": 2861583556.0, "step": 685 }, { "epoch": 1.3726521412471826, "grad_norm": 0.22449409694796932, "learning_rate": 3.464503892607434e-05, "loss": 0.5381, "num_tokens": 2865777860.0, "step": 686 }, { "epoch": 1.3746556473829201, "grad_norm": 0.2057245970683048, "learning_rate": 3.462839825625813e-05, "loss": 0.5537, "num_tokens": 2869972164.0, "step": 687 }, { "epoch": 1.3766591535186576, "grad_norm": 0.19882957924241743, "learning_rate": 3.461173630420754e-05, "loss": 0.537, "num_tokens": 2874166468.0, "step": 688 }, { "epoch": 1.3786626596543952, "grad_norm": 0.22277099297890252, "learning_rate": 3.459505309800241e-05, "loss": 0.556, "num_tokens": 2878360772.0, "step": 689 }, { "epoch": 1.3806661657901327, "grad_norm": 0.19022585298500969, "learning_rate": 3.45783486657584e-05, "loss": 0.556, "num_tokens": 2882555076.0, "step": 690 }, { "epoch": 1.3826696719258702, "grad_norm": 0.19650866121462704, "learning_rate": 3.456162303562697e-05, "loss": 0.5417, "num_tokens": 2886717876.0, "step": 691 }, { "epoch": 1.384673178061608, "grad_norm": 0.24027116454063047, "learning_rate": 3.454487623579529e-05, "loss": 0.5521, "num_tokens": 2890912180.0, "step": 692 }, { "epoch": 1.3866766841973455, "grad_norm": 0.1849806361064128, "learning_rate": 3.4528108294486184e-05, "loss": 0.5571, "num_tokens": 2895091056.0, "step": 693 }, { "epoch": 1.388680190333083, "grad_norm": 0.2529344295649574, "learning_rate": 3.451131923995815e-05, "loss": 0.5682, "num_tokens": 2899285360.0, "step": 694 }, { "epoch": 1.3906836964688205, "grad_norm": 0.19599495128042138, "learning_rate": 3.449450910050523e-05, "loss": 0.5529, "num_tokens": 2903463262.0, "step": 695 }, { "epoch": 1.392687202604558, "grad_norm": 0.24919743435642505, "learning_rate": 3.4477677904457004e-05, "loss": 0.5483, "num_tokens": 2907657566.0, "step": 696 }, { "epoch": 1.3946907087402955, "grad_norm": 0.20058362064314816, "learning_rate": 3.4460825680178565e-05, "loss": 0.5668, "num_tokens": 2911851870.0, "step": 697 }, { "epoch": 1.396694214876033, "grad_norm": 0.2855883993141657, "learning_rate": 3.4443952456070416e-05, "loss": 0.5499, "num_tokens": 2915995692.0, "step": 698 }, { "epoch": 1.3986977210117706, "grad_norm": 0.2309346034557157, "learning_rate": 3.442705826056846e-05, "loss": 0.5504, "num_tokens": 2920189996.0, "step": 699 }, { "epoch": 1.400701227147508, "grad_norm": 0.31058546145507915, "learning_rate": 3.441014312214395e-05, "loss": 0.5418, "num_tokens": 2924384300.0, "step": 700 }, { "epoch": 1.4027047332832456, "grad_norm": 0.26644062469524715, "learning_rate": 3.439320706930342e-05, "loss": 0.5431, "num_tokens": 2928578604.0, "step": 701 }, { "epoch": 1.4047082394189832, "grad_norm": 0.26980311921426525, "learning_rate": 3.4376250130588657e-05, "loss": 0.5465, "num_tokens": 2932772908.0, "step": 702 }, { "epoch": 1.4067117455547207, "grad_norm": 0.19610323768871785, "learning_rate": 3.4359272334576644e-05, "loss": 0.5458, "num_tokens": 2936962139.0, "step": 703 }, { "epoch": 1.4087152516904582, "grad_norm": 0.2359126280123898, "learning_rate": 3.434227370987953e-05, "loss": 0.5393, "num_tokens": 2941156443.0, "step": 704 }, { "epoch": 1.4107187578261957, "grad_norm": 0.19077986942269368, "learning_rate": 3.4325254285144516e-05, "loss": 0.5607, "num_tokens": 2945350747.0, "step": 705 }, { "epoch": 1.4127222639619335, "grad_norm": 0.23952654466165224, "learning_rate": 3.430821408905393e-05, "loss": 0.5484, "num_tokens": 2949545051.0, "step": 706 }, { "epoch": 1.414725770097671, "grad_norm": 0.20505117710979917, "learning_rate": 3.429115315032505e-05, "loss": 0.552, "num_tokens": 2953739355.0, "step": 707 }, { "epoch": 1.4167292762334085, "grad_norm": 0.21993643332997337, "learning_rate": 3.427407149771014e-05, "loss": 0.5472, "num_tokens": 2957933659.0, "step": 708 }, { "epoch": 1.418732782369146, "grad_norm": 0.20563553362863812, "learning_rate": 3.425696915999635e-05, "loss": 0.5463, "num_tokens": 2962077752.0, "step": 709 }, { "epoch": 1.4207362885048835, "grad_norm": 0.1584582091804748, "learning_rate": 3.42398461660057e-05, "loss": 0.5539, "num_tokens": 2966272056.0, "step": 710 }, { "epoch": 1.422739794640621, "grad_norm": 0.1940864266117268, "learning_rate": 3.422270254459503e-05, "loss": 0.5484, "num_tokens": 2970459531.0, "step": 711 }, { "epoch": 1.4247433007763586, "grad_norm": 0.18517859237993, "learning_rate": 3.420553832465593e-05, "loss": 0.5626, "num_tokens": 2974653835.0, "step": 712 }, { "epoch": 1.426746806912096, "grad_norm": 0.14511947563466235, "learning_rate": 3.4188353535114715e-05, "loss": 0.5371, "num_tokens": 2978840963.0, "step": 713 }, { "epoch": 1.4287503130478338, "grad_norm": 0.20908058403750637, "learning_rate": 3.417114820493235e-05, "loss": 0.5501, "num_tokens": 2983035267.0, "step": 714 }, { "epoch": 1.4307538191835714, "grad_norm": 0.21390089264480122, "learning_rate": 3.4153922363104436e-05, "loss": 0.5412, "num_tokens": 2987229571.0, "step": 715 }, { "epoch": 1.4327573253193089, "grad_norm": 0.14355027792841885, "learning_rate": 3.413667603866113e-05, "loss": 0.5417, "num_tokens": 2991423875.0, "step": 716 }, { "epoch": 1.4347608314550464, "grad_norm": 0.20408401732106654, "learning_rate": 3.41194092606671e-05, "loss": 0.5452, "num_tokens": 2995599795.0, "step": 717 }, { "epoch": 1.436764337590784, "grad_norm": 0.13312907661466644, "learning_rate": 3.41021220582215e-05, "loss": 0.5591, "num_tokens": 2999794099.0, "step": 718 }, { "epoch": 1.4387678437265214, "grad_norm": 0.1596687094645453, "learning_rate": 3.40848144604579e-05, "loss": 0.5499, "num_tokens": 3003988403.0, "step": 719 }, { "epoch": 1.440771349862259, "grad_norm": 0.1335392305255282, "learning_rate": 3.406748649654423e-05, "loss": 0.5462, "num_tokens": 3008156793.0, "step": 720 }, { "epoch": 1.4427748559979965, "grad_norm": 0.17997746411269896, "learning_rate": 3.405013819568276e-05, "loss": 0.5501, "num_tokens": 3012351097.0, "step": 721 }, { "epoch": 1.444778362133734, "grad_norm": 0.14962870434568742, "learning_rate": 3.403276958711003e-05, "loss": 0.5419, "num_tokens": 3016495378.0, "step": 722 }, { "epoch": 1.4467818682694715, "grad_norm": 0.20062465523115863, "learning_rate": 3.4015380700096785e-05, "loss": 0.5454, "num_tokens": 3020689682.0, "step": 723 }, { "epoch": 1.448785374405209, "grad_norm": 0.14861007229240075, "learning_rate": 3.399797156394796e-05, "loss": 0.5614, "num_tokens": 3024883986.0, "step": 724 }, { "epoch": 1.4507888805409466, "grad_norm": 0.17893036313310579, "learning_rate": 3.398054220800263e-05, "loss": 0.5615, "num_tokens": 3029078290.0, "step": 725 }, { "epoch": 1.452792386676684, "grad_norm": 0.16873913211058597, "learning_rate": 3.3963092661633906e-05, "loss": 0.5479, "num_tokens": 3033228772.0, "step": 726 }, { "epoch": 1.4547958928124216, "grad_norm": 0.15081368033118012, "learning_rate": 3.394562295424897e-05, "loss": 0.553, "num_tokens": 3037403839.0, "step": 727 }, { "epoch": 1.4567993989481594, "grad_norm": 0.16013268465255892, "learning_rate": 3.3928133115288945e-05, "loss": 0.5466, "num_tokens": 3041596853.0, "step": 728 }, { "epoch": 1.4588029050838969, "grad_norm": 0.15687045308934577, "learning_rate": 3.3910623174228894e-05, "loss": 0.556, "num_tokens": 3045791157.0, "step": 729 }, { "epoch": 1.4608064112196344, "grad_norm": 0.17164237366263047, "learning_rate": 3.389309316057777e-05, "loss": 0.5457, "num_tokens": 3049964583.0, "step": 730 }, { "epoch": 1.462809917355372, "grad_norm": 0.14569603440041728, "learning_rate": 3.3875543103878334e-05, "loss": 0.5523, "num_tokens": 3054121925.0, "step": 731 }, { "epoch": 1.4648134234911094, "grad_norm": 0.1954251890548124, "learning_rate": 3.385797303370714e-05, "loss": 0.5636, "num_tokens": 3058297500.0, "step": 732 }, { "epoch": 1.466816929626847, "grad_norm": 0.16388019037311227, "learning_rate": 3.384038297967446e-05, "loss": 0.5423, "num_tokens": 3062491804.0, "step": 733 }, { "epoch": 1.4688204357625845, "grad_norm": 0.15093537177028876, "learning_rate": 3.382277297142425e-05, "loss": 0.5592, "num_tokens": 3066664254.0, "step": 734 }, { "epoch": 1.470823941898322, "grad_norm": 0.16416732741527917, "learning_rate": 3.3805143038634084e-05, "loss": 0.5429, "num_tokens": 3070832609.0, "step": 735 }, { "epoch": 1.4728274480340597, "grad_norm": 0.19446288433633166, "learning_rate": 3.3787493211015134e-05, "loss": 0.5376, "num_tokens": 3075026913.0, "step": 736 }, { "epoch": 1.4748309541697973, "grad_norm": 0.15077113639849232, "learning_rate": 3.376982351831208e-05, "loss": 0.5482, "num_tokens": 3079198507.0, "step": 737 }, { "epoch": 1.4768344603055348, "grad_norm": 0.1792736159862218, "learning_rate": 3.3752133990303083e-05, "loss": 0.5526, "num_tokens": 3083392811.0, "step": 738 }, { "epoch": 1.4788379664412723, "grad_norm": 0.1744272347012629, "learning_rate": 3.3734424656799755e-05, "loss": 0.5418, "num_tokens": 3087582772.0, "step": 739 }, { "epoch": 1.4808414725770098, "grad_norm": 0.14235844536056458, "learning_rate": 3.371669554764706e-05, "loss": 0.5361, "num_tokens": 3091777076.0, "step": 740 }, { "epoch": 1.4828449787127473, "grad_norm": 0.1643809039163116, "learning_rate": 3.369894669272328e-05, "loss": 0.5321, "num_tokens": 3095971380.0, "step": 741 }, { "epoch": 1.4848484848484849, "grad_norm": 0.13127487209617206, "learning_rate": 3.3681178121940014e-05, "loss": 0.5519, "num_tokens": 3100151746.0, "step": 742 }, { "epoch": 1.4868519909842224, "grad_norm": 0.15092869049444674, "learning_rate": 3.3663389865242045e-05, "loss": 0.5532, "num_tokens": 3104346050.0, "step": 743 }, { "epoch": 1.48885549711996, "grad_norm": 0.14715271201497915, "learning_rate": 3.364558195260737e-05, "loss": 0.5708, "num_tokens": 3108540354.0, "step": 744 }, { "epoch": 1.4908590032556974, "grad_norm": 0.17133058742010013, "learning_rate": 3.3627754414047077e-05, "loss": 0.5366, "num_tokens": 3112734658.0, "step": 745 }, { "epoch": 1.492862509391435, "grad_norm": 0.1709441437788946, "learning_rate": 3.3609907279605356e-05, "loss": 0.5424, "num_tokens": 3116904766.0, "step": 746 }, { "epoch": 1.4948660155271725, "grad_norm": 0.14312891198426062, "learning_rate": 3.35920405793594e-05, "loss": 0.5441, "num_tokens": 3121093951.0, "step": 747 }, { "epoch": 1.49686952166291, "grad_norm": 0.18036982743340443, "learning_rate": 3.3574154343419394e-05, "loss": 0.5448, "num_tokens": 3125288255.0, "step": 748 }, { "epoch": 1.4988730277986475, "grad_norm": 0.13935314541632898, "learning_rate": 3.355624860192843e-05, "loss": 0.5494, "num_tokens": 3129482449.0, "step": 749 }, { "epoch": 1.500876533934385, "grad_norm": 0.16942142518819, "learning_rate": 3.353832338506249e-05, "loss": 0.5608, "num_tokens": 3133676753.0, "step": 750 }, { "epoch": 1.5028800400701228, "grad_norm": 0.17232520901294004, "learning_rate": 3.3520378723030365e-05, "loss": 0.5583, "num_tokens": 3137871057.0, "step": 751 }, { "epoch": 1.5048835462058603, "grad_norm": 0.17085665391602356, "learning_rate": 3.3502414646073606e-05, "loss": 0.5466, "num_tokens": 3142065361.0, "step": 752 }, { "epoch": 1.5068870523415978, "grad_norm": 0.1808894693369074, "learning_rate": 3.348443118446651e-05, "loss": 0.5441, "num_tokens": 3146259665.0, "step": 753 }, { "epoch": 1.5088905584773353, "grad_norm": 0.15946167842139772, "learning_rate": 3.3466428368516014e-05, "loss": 0.5498, "num_tokens": 3150381513.0, "step": 754 }, { "epoch": 1.5108940646130729, "grad_norm": 0.21725672268303625, "learning_rate": 3.3448406228561696e-05, "loss": 0.5533, "num_tokens": 3154575817.0, "step": 755 }, { "epoch": 1.5128975707488104, "grad_norm": 0.19902756256857163, "learning_rate": 3.343036479497569e-05, "loss": 0.5556, "num_tokens": 3158723405.0, "step": 756 }, { "epoch": 1.5149010768845481, "grad_norm": 0.16998079563726595, "learning_rate": 3.3412304098162644e-05, "loss": 0.5418, "num_tokens": 3162917709.0, "step": 757 }, { "epoch": 1.5169045830202856, "grad_norm": 0.18776258635389023, "learning_rate": 3.339422416855967e-05, "loss": 0.5546, "num_tokens": 3167112013.0, "step": 758 }, { "epoch": 1.5189080891560232, "grad_norm": 0.16932119786886057, "learning_rate": 3.337612503663629e-05, "loss": 0.5467, "num_tokens": 3171306317.0, "step": 759 }, { "epoch": 1.5209115952917607, "grad_norm": 0.18450538019718757, "learning_rate": 3.3358006732894395e-05, "loss": 0.538, "num_tokens": 3175500621.0, "step": 760 }, { "epoch": 1.5229151014274982, "grad_norm": 0.19255109848432544, "learning_rate": 3.3339869287868177e-05, "loss": 0.5479, "num_tokens": 3179637633.0, "step": 761 }, { "epoch": 1.5249186075632357, "grad_norm": 0.22131819112158932, "learning_rate": 3.33217127321241e-05, "loss": 0.5525, "num_tokens": 3183831937.0, "step": 762 }, { "epoch": 1.5269221136989732, "grad_norm": 0.15023312398354008, "learning_rate": 3.330353709626081e-05, "loss": 0.5498, "num_tokens": 3187987000.0, "step": 763 }, { "epoch": 1.5289256198347108, "grad_norm": 0.18249259817353078, "learning_rate": 3.328534241090913e-05, "loss": 0.55, "num_tokens": 3192181304.0, "step": 764 }, { "epoch": 1.5309291259704483, "grad_norm": 0.19922336573516097, "learning_rate": 3.326712870673199e-05, "loss": 0.5374, "num_tokens": 3196375608.0, "step": 765 }, { "epoch": 1.5329326321061858, "grad_norm": 0.1691602522147193, "learning_rate": 3.324889601442433e-05, "loss": 0.5452, "num_tokens": 3200569912.0, "step": 766 }, { "epoch": 1.5349361382419233, "grad_norm": 0.16635907319864318, "learning_rate": 3.323064436471316e-05, "loss": 0.5682, "num_tokens": 3204764216.0, "step": 767 }, { "epoch": 1.5369396443776608, "grad_norm": 0.1491064739692957, "learning_rate": 3.321237378835738e-05, "loss": 0.5585, "num_tokens": 3208950219.0, "step": 768 }, { "epoch": 1.5389431505133984, "grad_norm": 0.14824605164880217, "learning_rate": 3.31940843161478e-05, "loss": 0.5435, "num_tokens": 3213144523.0, "step": 769 }, { "epoch": 1.5409466566491359, "grad_norm": 0.17120561828131925, "learning_rate": 3.3175775978907093e-05, "loss": 0.5559, "num_tokens": 3217301084.0, "step": 770 }, { "epoch": 1.5429501627848734, "grad_norm": 0.1602758608285182, "learning_rate": 3.3157448807489716e-05, "loss": 0.5449, "num_tokens": 3221495388.0, "step": 771 }, { "epoch": 1.544953668920611, "grad_norm": 0.14506015790265292, "learning_rate": 3.313910283278185e-05, "loss": 0.5378, "num_tokens": 3225689692.0, "step": 772 }, { "epoch": 1.5469571750563484, "grad_norm": 0.15350405449646404, "learning_rate": 3.312073808570138e-05, "loss": 0.548, "num_tokens": 3229883996.0, "step": 773 }, { "epoch": 1.5489606811920862, "grad_norm": 0.14707439759822336, "learning_rate": 3.310235459719782e-05, "loss": 0.551, "num_tokens": 3234050967.0, "step": 774 }, { "epoch": 1.5509641873278237, "grad_norm": 0.1441819529924431, "learning_rate": 3.308395239825229e-05, "loss": 0.5407, "num_tokens": 3238242700.0, "step": 775 }, { "epoch": 1.5529676934635612, "grad_norm": 0.12369102758608712, "learning_rate": 3.3065531519877424e-05, "loss": 0.5445, "num_tokens": 3242437004.0, "step": 776 }, { "epoch": 1.5549711995992987, "grad_norm": 0.17927608864748862, "learning_rate": 3.304709199311732e-05, "loss": 0.5473, "num_tokens": 3246631308.0, "step": 777 }, { "epoch": 1.5569747057350363, "grad_norm": 0.23951056460233072, "learning_rate": 3.3028633849047525e-05, "loss": 0.5334, "num_tokens": 3250817464.0, "step": 778 }, { "epoch": 1.558978211870774, "grad_norm": 0.15098307613047882, "learning_rate": 3.301015711877497e-05, "loss": 0.5444, "num_tokens": 3255007105.0, "step": 779 }, { "epoch": 1.5609817180065115, "grad_norm": 0.22157042577833092, "learning_rate": 3.299166183343789e-05, "loss": 0.5479, "num_tokens": 3259184458.0, "step": 780 }, { "epoch": 1.562985224142249, "grad_norm": 0.1928033951347434, "learning_rate": 3.297314802420579e-05, "loss": 0.5539, "num_tokens": 3263378762.0, "step": 781 }, { "epoch": 1.5649887302779866, "grad_norm": 0.12020737458208786, "learning_rate": 3.2954615722279406e-05, "loss": 0.527, "num_tokens": 3267573066.0, "step": 782 }, { "epoch": 1.566992236413724, "grad_norm": 0.16939868333134722, "learning_rate": 3.293606495889063e-05, "loss": 0.5427, "num_tokens": 3271757689.0, "step": 783 }, { "epoch": 1.5689957425494616, "grad_norm": 0.158207450649658, "learning_rate": 3.2917495765302465e-05, "loss": 0.5561, "num_tokens": 3275951993.0, "step": 784 }, { "epoch": 1.5709992486851991, "grad_norm": 0.17490931987214176, "learning_rate": 3.289890817280897e-05, "loss": 0.5454, "num_tokens": 3280128312.0, "step": 785 }, { "epoch": 1.5730027548209367, "grad_norm": 0.13281907784933764, "learning_rate": 3.2880302212735235e-05, "loss": 0.5459, "num_tokens": 3284285175.0, "step": 786 }, { "epoch": 1.5750062609566742, "grad_norm": 0.1392293524576943, "learning_rate": 3.286167791643728e-05, "loss": 0.5483, "num_tokens": 3288479479.0, "step": 787 }, { "epoch": 1.5770097670924117, "grad_norm": 0.164685745415533, "learning_rate": 3.284303531530203e-05, "loss": 0.5446, "num_tokens": 3292673783.0, "step": 788 }, { "epoch": 1.5790132732281492, "grad_norm": 0.18568988322123245, "learning_rate": 3.282437444074727e-05, "loss": 0.5431, "num_tokens": 3296833503.0, "step": 789 }, { "epoch": 1.5810167793638867, "grad_norm": 0.12597967759856848, "learning_rate": 3.280569532422157e-05, "loss": 0.5432, "num_tokens": 3301027807.0, "step": 790 }, { "epoch": 1.5830202854996243, "grad_norm": 0.1853630514230512, "learning_rate": 3.278699799720425e-05, "loss": 0.5473, "num_tokens": 3305218067.0, "step": 791 }, { "epoch": 1.5850237916353618, "grad_norm": 0.15666070110343708, "learning_rate": 3.27682824912053e-05, "loss": 0.5475, "num_tokens": 3309412371.0, "step": 792 }, { "epoch": 1.5870272977710993, "grad_norm": 0.14742566669211762, "learning_rate": 3.2749548837765384e-05, "loss": 0.5529, "num_tokens": 3313585117.0, "step": 793 }, { "epoch": 1.5890308039068368, "grad_norm": 0.15444101075701122, "learning_rate": 3.2730797068455705e-05, "loss": 0.538, "num_tokens": 3317779421.0, "step": 794 }, { "epoch": 1.5910343100425743, "grad_norm": 0.1600396739739905, "learning_rate": 3.271202721487803e-05, "loss": 0.5347, "num_tokens": 3321973725.0, "step": 795 }, { "epoch": 1.593037816178312, "grad_norm": 0.17449899946677686, "learning_rate": 3.26932393086646e-05, "loss": 0.5494, "num_tokens": 3326168029.0, "step": 796 }, { "epoch": 1.5950413223140496, "grad_norm": 0.24727585710491995, "learning_rate": 3.267443338147805e-05, "loss": 0.5369, "num_tokens": 3330351496.0, "step": 797 }, { "epoch": 1.5970448284497871, "grad_norm": 0.1668776508859028, "learning_rate": 3.2655609465011416e-05, "loss": 0.5475, "num_tokens": 3334545800.0, "step": 798 }, { "epoch": 1.5990483345855246, "grad_norm": 0.21040150748488046, "learning_rate": 3.263676759098804e-05, "loss": 0.5586, "num_tokens": 3338740104.0, "step": 799 }, { "epoch": 1.6010518407212622, "grad_norm": 0.22704360770761262, "learning_rate": 3.261790779116153e-05, "loss": 0.5611, "num_tokens": 3342934408.0, "step": 800 }, { "epoch": 1.603055346857, "grad_norm": 0.1870941006637836, "learning_rate": 3.2599030097315696e-05, "loss": 0.5413, "num_tokens": 3347128712.0, "step": 801 }, { "epoch": 1.6050588529927374, "grad_norm": 0.2257608706783097, "learning_rate": 3.258013454126452e-05, "loss": 0.5656, "num_tokens": 3351318087.0, "step": 802 }, { "epoch": 1.607062359128475, "grad_norm": 0.24040541224386916, "learning_rate": 3.256122115485207e-05, "loss": 0.5438, "num_tokens": 3355512391.0, "step": 803 }, { "epoch": 1.6090658652642125, "grad_norm": 0.18767897446480142, "learning_rate": 3.2542289969952456e-05, "loss": 0.5535, "num_tokens": 3359706695.0, "step": 804 }, { "epoch": 1.61106937139995, "grad_norm": 0.2656596614596137, "learning_rate": 3.252334101846982e-05, "loss": 0.5551, "num_tokens": 3363900999.0, "step": 805 }, { "epoch": 1.6130728775356875, "grad_norm": 0.18654169196477227, "learning_rate": 3.250437433233821e-05, "loss": 0.5529, "num_tokens": 3368078076.0, "step": 806 }, { "epoch": 1.615076383671425, "grad_norm": 0.226634402367485, "learning_rate": 3.248538994352158e-05, "loss": 0.5378, "num_tokens": 3372272380.0, "step": 807 }, { "epoch": 1.6170798898071626, "grad_norm": 0.21120195908035264, "learning_rate": 3.246638788401369e-05, "loss": 0.5308, "num_tokens": 3376466684.0, "step": 808 }, { "epoch": 1.6190833959429, "grad_norm": 0.16375832950208952, "learning_rate": 3.244736818583813e-05, "loss": 0.5569, "num_tokens": 3380660988.0, "step": 809 }, { "epoch": 1.6210869020786376, "grad_norm": 0.1637567444812575, "learning_rate": 3.2428330881048176e-05, "loss": 0.5391, "num_tokens": 3384855292.0, "step": 810 }, { "epoch": 1.6230904082143751, "grad_norm": 0.13923004032587294, "learning_rate": 3.2409276001726784e-05, "loss": 0.534, "num_tokens": 3389045521.0, "step": 811 }, { "epoch": 1.6250939143501126, "grad_norm": 0.14260332041844576, "learning_rate": 3.239020357998653e-05, "loss": 0.5293, "num_tokens": 3393239825.0, "step": 812 }, { "epoch": 1.6270974204858502, "grad_norm": 0.12216201172758549, "learning_rate": 3.2371113647969554e-05, "loss": 0.5633, "num_tokens": 3397395862.0, "step": 813 }, { "epoch": 1.6291009266215877, "grad_norm": 0.13248194178754022, "learning_rate": 3.235200623784752e-05, "loss": 0.5344, "num_tokens": 3401565195.0, "step": 814 }, { "epoch": 1.6311044327573252, "grad_norm": 0.13821192647543215, "learning_rate": 3.2332881381821505e-05, "loss": 0.5548, "num_tokens": 3405759499.0, "step": 815 }, { "epoch": 1.6331079388930627, "grad_norm": 0.13893377498854398, "learning_rate": 3.2313739112122045e-05, "loss": 0.5391, "num_tokens": 3409925608.0, "step": 816 }, { "epoch": 1.6351114450288002, "grad_norm": 0.13582500037287767, "learning_rate": 3.229457946100897e-05, "loss": 0.5577, "num_tokens": 3414119912.0, "step": 817 }, { "epoch": 1.637114951164538, "grad_norm": 0.12669924228011795, "learning_rate": 3.227540246077144e-05, "loss": 0.5411, "num_tokens": 3418314216.0, "step": 818 }, { "epoch": 1.6391184573002755, "grad_norm": 0.16684286594411438, "learning_rate": 3.225620814372784e-05, "loss": 0.5534, "num_tokens": 3422502896.0, "step": 819 }, { "epoch": 1.641121963436013, "grad_norm": 0.17065215699870606, "learning_rate": 3.223699654222572e-05, "loss": 0.541, "num_tokens": 3426697200.0, "step": 820 }, { "epoch": 1.6431254695717505, "grad_norm": 0.14672338592271636, "learning_rate": 3.221776768864178e-05, "loss": 0.5353, "num_tokens": 3430891504.0, "step": 821 }, { "epoch": 1.645128975707488, "grad_norm": 0.17556760051681092, "learning_rate": 3.219852161538181e-05, "loss": 0.5468, "num_tokens": 3435061219.0, "step": 822 }, { "epoch": 1.6471324818432258, "grad_norm": 0.16746049878047856, "learning_rate": 3.2179258354880585e-05, "loss": 0.5489, "num_tokens": 3439255523.0, "step": 823 }, { "epoch": 1.6491359879789633, "grad_norm": 0.13989780289977527, "learning_rate": 3.215997793960185e-05, "loss": 0.5373, "num_tokens": 3443449827.0, "step": 824 }, { "epoch": 1.6511394941147008, "grad_norm": 0.1658834548027728, "learning_rate": 3.214068040203828e-05, "loss": 0.5427, "num_tokens": 3447622692.0, "step": 825 }, { "epoch": 1.6531430002504384, "grad_norm": 0.12225670841561433, "learning_rate": 3.21213657747114e-05, "loss": 0.5452, "num_tokens": 3451816996.0, "step": 826 }, { "epoch": 1.6551465063861759, "grad_norm": 0.18412541265176205, "learning_rate": 3.2102034090171534e-05, "loss": 0.5489, "num_tokens": 3456011300.0, "step": 827 }, { "epoch": 1.6571500125219134, "grad_norm": 0.14968453505702436, "learning_rate": 3.208268538099774e-05, "loss": 0.5408, "num_tokens": 3460159707.0, "step": 828 }, { "epoch": 1.659153518657651, "grad_norm": 0.15615484749096373, "learning_rate": 3.206331967979778e-05, "loss": 0.5426, "num_tokens": 3464323276.0, "step": 829 }, { "epoch": 1.6611570247933884, "grad_norm": 0.15332782370291623, "learning_rate": 3.204393701920805e-05, "loss": 0.5415, "num_tokens": 3468502246.0, "step": 830 }, { "epoch": 1.663160530929126, "grad_norm": 0.1725040306075287, "learning_rate": 3.202453743189352e-05, "loss": 0.5362, "num_tokens": 3472696550.0, "step": 831 }, { "epoch": 1.6651640370648635, "grad_norm": 0.14781871257843598, "learning_rate": 3.20051209505477e-05, "loss": 0.536, "num_tokens": 3476885709.0, "step": 832 }, { "epoch": 1.667167543200601, "grad_norm": 0.1498070582592841, "learning_rate": 3.198568760789257e-05, "loss": 0.5382, "num_tokens": 3481049973.0, "step": 833 }, { "epoch": 1.6691710493363385, "grad_norm": 0.13798882076278646, "learning_rate": 3.1966237436678505e-05, "loss": 0.5446, "num_tokens": 3485244277.0, "step": 834 }, { "epoch": 1.671174555472076, "grad_norm": 0.155111052242888, "learning_rate": 3.194677046968425e-05, "loss": 0.5291, "num_tokens": 3489378211.0, "step": 835 }, { "epoch": 1.6731780616078136, "grad_norm": 0.15913383158274227, "learning_rate": 3.192728673971687e-05, "loss": 0.537, "num_tokens": 3493560756.0, "step": 836 }, { "epoch": 1.675181567743551, "grad_norm": 0.13510182443445887, "learning_rate": 3.1907786279611676e-05, "loss": 0.5473, "num_tokens": 3497726585.0, "step": 837 }, { "epoch": 1.6771850738792886, "grad_norm": 0.12136977634178987, "learning_rate": 3.188826912223215e-05, "loss": 0.5394, "num_tokens": 3501920581.0, "step": 838 }, { "epoch": 1.6791885800150261, "grad_norm": 0.15552650086026126, "learning_rate": 3.186873530046996e-05, "loss": 0.5364, "num_tokens": 3506114885.0, "step": 839 }, { "epoch": 1.6811920861507639, "grad_norm": 0.1348148577621133, "learning_rate": 3.184918484724479e-05, "loss": 0.5557, "num_tokens": 3510309189.0, "step": 840 }, { "epoch": 1.6831955922865014, "grad_norm": 0.1645180524885054, "learning_rate": 3.182961779550443e-05, "loss": 0.551, "num_tokens": 3514503493.0, "step": 841 }, { "epoch": 1.685199098422239, "grad_norm": 0.14860876602023382, "learning_rate": 3.181003417822458e-05, "loss": 0.546, "num_tokens": 3518697797.0, "step": 842 }, { "epoch": 1.6872026045579764, "grad_norm": 0.13354923972230237, "learning_rate": 3.179043402840889e-05, "loss": 0.5337, "num_tokens": 3522892101.0, "step": 843 }, { "epoch": 1.689206110693714, "grad_norm": 0.1408605275824236, "learning_rate": 3.177081737908885e-05, "loss": 0.5484, "num_tokens": 3527058096.0, "step": 844 }, { "epoch": 1.6912096168294517, "grad_norm": 0.14020682267417103, "learning_rate": 3.175118426332379e-05, "loss": 0.5382, "num_tokens": 3531252400.0, "step": 845 }, { "epoch": 1.6932131229651892, "grad_norm": 0.13744871924696253, "learning_rate": 3.1731534714200765e-05, "loss": 0.5415, "num_tokens": 3535446704.0, "step": 846 }, { "epoch": 1.6952166291009267, "grad_norm": 0.14749753935689622, "learning_rate": 3.1711868764834515e-05, "loss": 0.5408, "num_tokens": 3539641008.0, "step": 847 }, { "epoch": 1.6972201352366643, "grad_norm": 0.1476688447291076, "learning_rate": 3.1692186448367454e-05, "loss": 0.5506, "num_tokens": 3543835312.0, "step": 848 }, { "epoch": 1.6992236413724018, "grad_norm": 0.14254673436012016, "learning_rate": 3.167248779796955e-05, "loss": 0.5291, "num_tokens": 3547996761.0, "step": 849 }, { "epoch": 1.7012271475081393, "grad_norm": 0.1616479703559933, "learning_rate": 3.16527728468383e-05, "loss": 0.5355, "num_tokens": 3552191065.0, "step": 850 }, { "epoch": 1.7032306536438768, "grad_norm": 0.1724301289190588, "learning_rate": 3.163304162819869e-05, "loss": 0.5331, "num_tokens": 3556385369.0, "step": 851 }, { "epoch": 1.7052341597796143, "grad_norm": 0.1483217382702013, "learning_rate": 3.1613294175303106e-05, "loss": 0.5351, "num_tokens": 3560579673.0, "step": 852 }, { "epoch": 1.7072376659153519, "grad_norm": 0.1813219145102606, "learning_rate": 3.159353052143129e-05, "loss": 0.5399, "num_tokens": 3564765577.0, "step": 853 }, { "epoch": 1.7092411720510894, "grad_norm": 0.1491818111031843, "learning_rate": 3.15737506998903e-05, "loss": 0.5521, "num_tokens": 3568940787.0, "step": 854 }, { "epoch": 1.711244678186827, "grad_norm": 0.13184576794394326, "learning_rate": 3.155395474401443e-05, "loss": 0.548, "num_tokens": 3573064521.0, "step": 855 }, { "epoch": 1.7132481843225644, "grad_norm": 0.1430761021476999, "learning_rate": 3.153414268716518e-05, "loss": 0.529, "num_tokens": 3577219808.0, "step": 856 }, { "epoch": 1.715251690458302, "grad_norm": 0.1513108572145931, "learning_rate": 3.151431456273116e-05, "loss": 0.538, "num_tokens": 3581388017.0, "step": 857 }, { "epoch": 1.7172551965940395, "grad_norm": 0.1387982069435468, "learning_rate": 3.149447040412807e-05, "loss": 0.5405, "num_tokens": 3585582321.0, "step": 858 }, { "epoch": 1.719258702729777, "grad_norm": 0.1603983224177435, "learning_rate": 3.147461024479866e-05, "loss": 0.5363, "num_tokens": 3589769484.0, "step": 859 }, { "epoch": 1.7212622088655145, "grad_norm": 0.16041032379661938, "learning_rate": 3.145473411821259e-05, "loss": 0.5503, "num_tokens": 3593963788.0, "step": 860 }, { "epoch": 1.723265715001252, "grad_norm": 0.23475657523759563, "learning_rate": 3.143484205786647e-05, "loss": 0.5471, "num_tokens": 3598158092.0, "step": 861 }, { "epoch": 1.7252692211369898, "grad_norm": 0.1428397815019273, "learning_rate": 3.141493409728376e-05, "loss": 0.5385, "num_tokens": 3602300301.0, "step": 862 }, { "epoch": 1.7272727272727273, "grad_norm": 0.20686519346157536, "learning_rate": 3.1395010270014684e-05, "loss": 0.5414, "num_tokens": 3606494605.0, "step": 863 }, { "epoch": 1.7292762334084648, "grad_norm": 0.12292126063661239, "learning_rate": 3.137507060963625e-05, "loss": 0.5388, "num_tokens": 3610657572.0, "step": 864 }, { "epoch": 1.7312797395442023, "grad_norm": 0.16289989445328856, "learning_rate": 3.135511514975213e-05, "loss": 0.5525, "num_tokens": 3614831071.0, "step": 865 }, { "epoch": 1.7332832456799399, "grad_norm": 0.13901404580896637, "learning_rate": 3.1335143923992594e-05, "loss": 0.5314, "num_tokens": 3619008290.0, "step": 866 }, { "epoch": 1.7352867518156776, "grad_norm": 0.15578205097213613, "learning_rate": 3.1315156966014535e-05, "loss": 0.5443, "num_tokens": 3623178136.0, "step": 867 }, { "epoch": 1.7372902579514151, "grad_norm": 0.14440376961338228, "learning_rate": 3.129515430950132e-05, "loss": 0.5501, "num_tokens": 3627357412.0, "step": 868 }, { "epoch": 1.7392937640871526, "grad_norm": 0.1630523877209803, "learning_rate": 3.127513598816279e-05, "loss": 0.5401, "num_tokens": 3631551716.0, "step": 869 }, { "epoch": 1.7412972702228902, "grad_norm": 0.14026909299000495, "learning_rate": 3.1255102035735165e-05, "loss": 0.5362, "num_tokens": 3635740712.0, "step": 870 }, { "epoch": 1.7433007763586277, "grad_norm": 0.1544298778143068, "learning_rate": 3.123505248598104e-05, "loss": 0.5504, "num_tokens": 3639920978.0, "step": 871 }, { "epoch": 1.7453042824943652, "grad_norm": 0.14747727065855198, "learning_rate": 3.121498737268926e-05, "loss": 0.5549, "num_tokens": 3644113558.0, "step": 872 }, { "epoch": 1.7473077886301027, "grad_norm": 0.14587915831553191, "learning_rate": 3.1194906729674945e-05, "loss": 0.5277, "num_tokens": 3648275872.0, "step": 873 }, { "epoch": 1.7493112947658402, "grad_norm": 0.16238287180529853, "learning_rate": 3.117481059077933e-05, "loss": 0.5391, "num_tokens": 3652458743.0, "step": 874 }, { "epoch": 1.7513148009015778, "grad_norm": 0.1344631797458481, "learning_rate": 3.115469898986981e-05, "loss": 0.5392, "num_tokens": 3656643757.0, "step": 875 }, { "epoch": 1.7533183070373153, "grad_norm": 0.14290532754531787, "learning_rate": 3.1134571960839805e-05, "loss": 0.5411, "num_tokens": 3660808615.0, "step": 876 }, { "epoch": 1.7553218131730528, "grad_norm": 0.14506317383551665, "learning_rate": 3.111442953760876e-05, "loss": 0.5404, "num_tokens": 3664977284.0, "step": 877 }, { "epoch": 1.7573253193087903, "grad_norm": 0.12022048705102993, "learning_rate": 3.109427175412207e-05, "loss": 0.5443, "num_tokens": 3669171588.0, "step": 878 }, { "epoch": 1.7593288254445278, "grad_norm": 0.13725944383440086, "learning_rate": 3.1074098644350976e-05, "loss": 0.5467, "num_tokens": 3673320102.0, "step": 879 }, { "epoch": 1.7613323315802654, "grad_norm": 0.13867206145171732, "learning_rate": 3.105391024229259e-05, "loss": 0.545, "num_tokens": 3677494900.0, "step": 880 }, { "epoch": 1.7633358377160029, "grad_norm": 0.17400262540328307, "learning_rate": 3.1033706581969765e-05, "loss": 0.5295, "num_tokens": 3681668525.0, "step": 881 }, { "epoch": 1.7653393438517404, "grad_norm": 0.11989302395313602, "learning_rate": 3.101348769743109e-05, "loss": 0.5435, "num_tokens": 3685862829.0, "step": 882 }, { "epoch": 1.767342849987478, "grad_norm": 0.1422000941847472, "learning_rate": 3.09932536227508e-05, "loss": 0.5432, "num_tokens": 3690026231.0, "step": 883 }, { "epoch": 1.7693463561232157, "grad_norm": 0.14140567245470703, "learning_rate": 3.097300439202873e-05, "loss": 0.5359, "num_tokens": 3694208982.0, "step": 884 }, { "epoch": 1.7713498622589532, "grad_norm": 0.13832677413842398, "learning_rate": 3.095274003939026e-05, "loss": 0.5525, "num_tokens": 3698403286.0, "step": 885 }, { "epoch": 1.7733533683946907, "grad_norm": 0.15676995439188127, "learning_rate": 3.093246059898625e-05, "loss": 0.536, "num_tokens": 3702553515.0, "step": 886 }, { "epoch": 1.7753568745304282, "grad_norm": 0.14370512698916382, "learning_rate": 3.0912166104993e-05, "loss": 0.5426, "num_tokens": 3706731446.0, "step": 887 }, { "epoch": 1.7773603806661658, "grad_norm": 0.1671958895306106, "learning_rate": 3.089185659161216e-05, "loss": 0.5386, "num_tokens": 3710909806.0, "step": 888 }, { "epoch": 1.7793638868019035, "grad_norm": 0.15325582134592738, "learning_rate": 3.087153209307071e-05, "loss": 0.5238, "num_tokens": 3715104110.0, "step": 889 }, { "epoch": 1.781367392937641, "grad_norm": 0.1750936478695255, "learning_rate": 3.085119264362087e-05, "loss": 0.5346, "num_tokens": 3719288287.0, "step": 890 }, { "epoch": 1.7833708990733785, "grad_norm": 0.19720784100022154, "learning_rate": 3.083083827754006e-05, "loss": 0.555, "num_tokens": 3723482591.0, "step": 891 }, { "epoch": 1.785374405209116, "grad_norm": 0.1177720082045312, "learning_rate": 3.0810469029130846e-05, "loss": 0.5381, "num_tokens": 3727661067.0, "step": 892 }, { "epoch": 1.7873779113448536, "grad_norm": 0.1404500693019182, "learning_rate": 3.079008493272086e-05, "loss": 0.5345, "num_tokens": 3731827318.0, "step": 893 }, { "epoch": 1.789381417480591, "grad_norm": 0.1141134763156892, "learning_rate": 3.076968602266277e-05, "loss": 0.5435, "num_tokens": 3736021622.0, "step": 894 }, { "epoch": 1.7913849236163286, "grad_norm": 0.15486445419273365, "learning_rate": 3.074927233333421e-05, "loss": 0.5426, "num_tokens": 3740189250.0, "step": 895 }, { "epoch": 1.7933884297520661, "grad_norm": 0.13012688592685923, "learning_rate": 3.0728843899137705e-05, "loss": 0.5375, "num_tokens": 3744359581.0, "step": 896 }, { "epoch": 1.7953919358878037, "grad_norm": 0.1369422742996802, "learning_rate": 3.070840075450064e-05, "loss": 0.5326, "num_tokens": 3748553885.0, "step": 897 }, { "epoch": 1.7973954420235412, "grad_norm": 0.13317783872853634, "learning_rate": 3.0687942933875206e-05, "loss": 0.5424, "num_tokens": 3752748189.0, "step": 898 }, { "epoch": 1.7993989481592787, "grad_norm": 0.1425267609612362, "learning_rate": 3.06674704717383e-05, "loss": 0.5396, "num_tokens": 3756938192.0, "step": 899 }, { "epoch": 1.8014024542950162, "grad_norm": 0.15345974269842502, "learning_rate": 3.064698340259151e-05, "loss": 0.5472, "num_tokens": 3761132496.0, "step": 900 }, { "epoch": 1.8034059604307537, "grad_norm": 0.13100607385812002, "learning_rate": 3.062648176096103e-05, "loss": 0.5295, "num_tokens": 3765307119.0, "step": 901 }, { "epoch": 1.8054094665664913, "grad_norm": 0.15052790611651917, "learning_rate": 3.0605965581397624e-05, "loss": 0.5491, "num_tokens": 3769501423.0, "step": 902 }, { "epoch": 1.8074129727022288, "grad_norm": 0.1460877444502167, "learning_rate": 3.0585434898476545e-05, "loss": 0.5385, "num_tokens": 3773692721.0, "step": 903 }, { "epoch": 1.8094164788379663, "grad_norm": 0.14858206574301627, "learning_rate": 3.056488974679751e-05, "loss": 0.5542, "num_tokens": 3777887025.0, "step": 904 }, { "epoch": 1.8114199849737038, "grad_norm": 0.14971968533960325, "learning_rate": 3.0544330160984575e-05, "loss": 0.5447, "num_tokens": 3782081329.0, "step": 905 }, { "epoch": 1.8134234911094416, "grad_norm": 0.12744718589186463, "learning_rate": 3.0523756175686174e-05, "loss": 0.5379, "num_tokens": 3786267791.0, "step": 906 }, { "epoch": 1.815426997245179, "grad_norm": 0.1610090005710843, "learning_rate": 3.0503167825574986e-05, "loss": 0.5535, "num_tokens": 3790462095.0, "step": 907 }, { "epoch": 1.8174305033809166, "grad_norm": 0.14078369391886453, "learning_rate": 3.0482565145347882e-05, "loss": 0.5538, "num_tokens": 3794652008.0, "step": 908 }, { "epoch": 1.8194340095166541, "grad_norm": 0.12983084836525738, "learning_rate": 3.046194816972591e-05, "loss": 0.5443, "num_tokens": 3798846312.0, "step": 909 }, { "epoch": 1.8214375156523916, "grad_norm": 0.1269950682636296, "learning_rate": 3.0441316933454187e-05, "loss": 0.5323, "num_tokens": 3802977300.0, "step": 910 }, { "epoch": 1.8234410217881294, "grad_norm": 0.12564318355942508, "learning_rate": 3.0420671471301887e-05, "loss": 0.5452, "num_tokens": 3807171604.0, "step": 911 }, { "epoch": 1.825444527923867, "grad_norm": 0.1118846721961704, "learning_rate": 3.0400011818062127e-05, "loss": 0.56, "num_tokens": 3811336144.0, "step": 912 }, { "epoch": 1.8274480340596044, "grad_norm": 0.13553072999927918, "learning_rate": 3.0379338008551976e-05, "loss": 0.5381, "num_tokens": 3815530448.0, "step": 913 }, { "epoch": 1.829451540195342, "grad_norm": 0.14151449679723804, "learning_rate": 3.035865007761233e-05, "loss": 0.546, "num_tokens": 3819702741.0, "step": 914 }, { "epoch": 1.8314550463310795, "grad_norm": 0.11772636097302473, "learning_rate": 3.0337948060107902e-05, "loss": 0.5489, "num_tokens": 3823856510.0, "step": 915 }, { "epoch": 1.833458552466817, "grad_norm": 0.13789371944113588, "learning_rate": 3.031723199092713e-05, "loss": 0.5445, "num_tokens": 3828024916.0, "step": 916 }, { "epoch": 1.8354620586025545, "grad_norm": 0.1361764599255284, "learning_rate": 3.029650190498215e-05, "loss": 0.5436, "num_tokens": 3832219220.0, "step": 917 }, { "epoch": 1.837465564738292, "grad_norm": 0.1439148472623406, "learning_rate": 3.0275757837208705e-05, "loss": 0.5292, "num_tokens": 3836408973.0, "step": 918 }, { "epoch": 1.8394690708740296, "grad_norm": 0.13254468880334253, "learning_rate": 3.0254999822566104e-05, "loss": 0.5433, "num_tokens": 3840603277.0, "step": 919 }, { "epoch": 1.841472577009767, "grad_norm": 0.13676063502799618, "learning_rate": 3.0234227896037178e-05, "loss": 0.5383, "num_tokens": 3844713555.0, "step": 920 }, { "epoch": 1.8434760831455046, "grad_norm": 0.13613675146104023, "learning_rate": 3.021344209262816e-05, "loss": 0.5405, "num_tokens": 3848892550.0, "step": 921 }, { "epoch": 1.8454795892812421, "grad_norm": 0.13532850759818502, "learning_rate": 3.0192642447368733e-05, "loss": 0.5298, "num_tokens": 3853075531.0, "step": 922 }, { "epoch": 1.8474830954169796, "grad_norm": 0.16530475593078903, "learning_rate": 3.0171828995311846e-05, "loss": 0.5433, "num_tokens": 3857269835.0, "step": 923 }, { "epoch": 1.8494866015527172, "grad_norm": 0.15424213848756746, "learning_rate": 3.015100177153375e-05, "loss": 0.5404, "num_tokens": 3861433848.0, "step": 924 }, { "epoch": 1.8514901076884547, "grad_norm": 0.13819184832048212, "learning_rate": 3.013016081113389e-05, "loss": 0.533, "num_tokens": 3865598064.0, "step": 925 }, { "epoch": 1.8534936138241922, "grad_norm": 0.1275271410719411, "learning_rate": 3.0109306149234886e-05, "loss": 0.5383, "num_tokens": 3869790674.0, "step": 926 }, { "epoch": 1.8554971199599297, "grad_norm": 0.16836926143977649, "learning_rate": 3.008843782098241e-05, "loss": 0.5511, "num_tokens": 3873971663.0, "step": 927 }, { "epoch": 1.8575006260956675, "grad_norm": 0.14645452713765267, "learning_rate": 3.00675558615452e-05, "loss": 0.5352, "num_tokens": 3878165967.0, "step": 928 }, { "epoch": 1.859504132231405, "grad_norm": 0.19030693435422633, "learning_rate": 3.0046660306114945e-05, "loss": 0.5413, "num_tokens": 3882360271.0, "step": 929 }, { "epoch": 1.8615076383671425, "grad_norm": 0.1540177132839084, "learning_rate": 3.002575118990626e-05, "loss": 0.5429, "num_tokens": 3886554575.0, "step": 930 }, { "epoch": 1.86351114450288, "grad_norm": 0.14600155607831336, "learning_rate": 3.0004828548156607e-05, "loss": 0.5255, "num_tokens": 3890739977.0, "step": 931 }, { "epoch": 1.8655146506386175, "grad_norm": 0.14232395619991478, "learning_rate": 2.998389241612623e-05, "loss": 0.558, "num_tokens": 3894924012.0, "step": 932 }, { "epoch": 1.8675181567743553, "grad_norm": 0.14488729469774225, "learning_rate": 2.9962942829098135e-05, "loss": 0.5412, "num_tokens": 3899067843.0, "step": 933 }, { "epoch": 1.8695216629100928, "grad_norm": 0.11640065377306204, "learning_rate": 2.994197982237798e-05, "loss": 0.5307, "num_tokens": 3903262147.0, "step": 934 }, { "epoch": 1.8715251690458303, "grad_norm": 0.1569259895278892, "learning_rate": 2.9921003431294068e-05, "loss": 0.5243, "num_tokens": 3907430576.0, "step": 935 }, { "epoch": 1.8735286751815678, "grad_norm": 0.18050831699486602, "learning_rate": 2.990001369119721e-05, "loss": 0.5425, "num_tokens": 3911582748.0, "step": 936 }, { "epoch": 1.8755321813173054, "grad_norm": 0.11660751678239911, "learning_rate": 2.9879010637460747e-05, "loss": 0.547, "num_tokens": 3915777052.0, "step": 937 }, { "epoch": 1.877535687453043, "grad_norm": 0.16767491402907514, "learning_rate": 2.985799430548047e-05, "loss": 0.5411, "num_tokens": 3919971356.0, "step": 938 }, { "epoch": 1.8795391935887804, "grad_norm": 0.15541643669165797, "learning_rate": 2.983696473067451e-05, "loss": 0.5379, "num_tokens": 3924128928.0, "step": 939 }, { "epoch": 1.881542699724518, "grad_norm": 0.14685970308687699, "learning_rate": 2.981592194848334e-05, "loss": 0.5389, "num_tokens": 3928299106.0, "step": 940 }, { "epoch": 1.8835462058602555, "grad_norm": 0.12285035268429134, "learning_rate": 2.9794865994369693e-05, "loss": 0.5261, "num_tokens": 3932480650.0, "step": 941 }, { "epoch": 1.885549711995993, "grad_norm": 0.1858789821788028, "learning_rate": 2.9773796903818485e-05, "loss": 0.5453, "num_tokens": 3936674954.0, "step": 942 }, { "epoch": 1.8875532181317305, "grad_norm": 0.12214016262014109, "learning_rate": 2.9752714712336784e-05, "loss": 0.5373, "num_tokens": 3940864243.0, "step": 943 }, { "epoch": 1.889556724267468, "grad_norm": 0.1651416472133058, "learning_rate": 2.9731619455453723e-05, "loss": 0.5277, "num_tokens": 3945044708.0, "step": 944 }, { "epoch": 1.8915602304032055, "grad_norm": 0.16772011189360742, "learning_rate": 2.9710511168720465e-05, "loss": 0.5291, "num_tokens": 3949228381.0, "step": 945 }, { "epoch": 1.893563736538943, "grad_norm": 0.15328005033710848, "learning_rate": 2.9689389887710133e-05, "loss": 0.5344, "num_tokens": 3953422685.0, "step": 946 }, { "epoch": 1.8955672426746806, "grad_norm": 0.14657534499314354, "learning_rate": 2.9668255648017737e-05, "loss": 0.5292, "num_tokens": 3957602998.0, "step": 947 }, { "epoch": 1.897570748810418, "grad_norm": 0.219579873812027, "learning_rate": 2.964710848526014e-05, "loss": 0.5387, "num_tokens": 3961750527.0, "step": 948 }, { "epoch": 1.8995742549461556, "grad_norm": 0.13144443060878386, "learning_rate": 2.962594843507597e-05, "loss": 0.5343, "num_tokens": 3965927684.0, "step": 949 }, { "epoch": 1.9015777610818934, "grad_norm": 0.12615021723306968, "learning_rate": 2.9604775533125583e-05, "loss": 0.5344, "num_tokens": 3970106321.0, "step": 950 }, { "epoch": 1.9035812672176309, "grad_norm": 0.1848476809326013, "learning_rate": 2.9583589815090995e-05, "loss": 0.5388, "num_tokens": 3974300625.0, "step": 951 }, { "epoch": 1.9055847733533684, "grad_norm": 0.13945788539709553, "learning_rate": 2.9562391316675812e-05, "loss": 0.5457, "num_tokens": 3978494929.0, "step": 952 }, { "epoch": 1.907588279489106, "grad_norm": 0.15853928784571514, "learning_rate": 2.9541180073605194e-05, "loss": 0.5357, "num_tokens": 3982689233.0, "step": 953 }, { "epoch": 1.9095917856248434, "grad_norm": 0.18995805874782726, "learning_rate": 2.9519956121625754e-05, "loss": 0.5446, "num_tokens": 3986878403.0, "step": 954 }, { "epoch": 1.911595291760581, "grad_norm": 0.17510488921256476, "learning_rate": 2.9498719496505538e-05, "loss": 0.5393, "num_tokens": 3991064135.0, "step": 955 }, { "epoch": 1.9135987978963187, "grad_norm": 0.11703341119683151, "learning_rate": 2.947747023403396e-05, "loss": 0.5318, "num_tokens": 3995258439.0, "step": 956 }, { "epoch": 1.9156023040320562, "grad_norm": 0.15682374233979451, "learning_rate": 2.9456208370021714e-05, "loss": 0.5375, "num_tokens": 3999436941.0, "step": 957 }, { "epoch": 1.9176058101677937, "grad_norm": 0.13967372832691546, "learning_rate": 2.9434933940300733e-05, "loss": 0.5369, "num_tokens": 4003631245.0, "step": 958 }, { "epoch": 1.9196093163035313, "grad_norm": 0.14666208481054457, "learning_rate": 2.941364698072414e-05, "loss": 0.5479, "num_tokens": 4007825549.0, "step": 959 }, { "epoch": 1.9216128224392688, "grad_norm": 0.13300979005328573, "learning_rate": 2.9392347527166156e-05, "loss": 0.5342, "num_tokens": 4011992013.0, "step": 960 }, { "epoch": 1.9236163285750063, "grad_norm": 0.14162603143922206, "learning_rate": 2.9371035615522078e-05, "loss": 0.5382, "num_tokens": 4016161830.0, "step": 961 }, { "epoch": 1.9256198347107438, "grad_norm": 0.16231726253764667, "learning_rate": 2.9349711281708174e-05, "loss": 0.5345, "num_tokens": 4020338789.0, "step": 962 }, { "epoch": 1.9276233408464813, "grad_norm": 0.13171798014091857, "learning_rate": 2.9328374561661666e-05, "loss": 0.5476, "num_tokens": 4024500508.0, "step": 963 }, { "epoch": 1.9296268469822189, "grad_norm": 0.15809381699255476, "learning_rate": 2.9307025491340642e-05, "loss": 0.5325, "num_tokens": 4028655004.0, "step": 964 }, { "epoch": 1.9316303531179564, "grad_norm": 0.12717156561396187, "learning_rate": 2.928566410672401e-05, "loss": 0.5299, "num_tokens": 4032818633.0, "step": 965 }, { "epoch": 1.933633859253694, "grad_norm": 0.16965154314738576, "learning_rate": 2.9264290443811423e-05, "loss": 0.5418, "num_tokens": 4036999418.0, "step": 966 }, { "epoch": 1.9356373653894314, "grad_norm": 0.1376707825478847, "learning_rate": 2.924290453862323e-05, "loss": 0.5277, "num_tokens": 4041162948.0, "step": 967 }, { "epoch": 1.937640871525169, "grad_norm": 0.1532351245221871, "learning_rate": 2.9221506427200404e-05, "loss": 0.5352, "num_tokens": 4045357252.0, "step": 968 }, { "epoch": 1.9396443776609065, "grad_norm": 0.1681532819648909, "learning_rate": 2.9200096145604497e-05, "loss": 0.539, "num_tokens": 4049504926.0, "step": 969 }, { "epoch": 1.941647883796644, "grad_norm": 0.16031413935878672, "learning_rate": 2.9178673729917583e-05, "loss": 0.5348, "num_tokens": 4053678759.0, "step": 970 }, { "epoch": 1.9436513899323815, "grad_norm": 0.16819890152016992, "learning_rate": 2.915723921624216e-05, "loss": 0.5291, "num_tokens": 4057873063.0, "step": 971 }, { "epoch": 1.9456548960681193, "grad_norm": 0.1470534398244089, "learning_rate": 2.913579264070113e-05, "loss": 0.5385, "num_tokens": 4062067367.0, "step": 972 }, { "epoch": 1.9476584022038568, "grad_norm": 0.13472844235549458, "learning_rate": 2.9114334039437718e-05, "loss": 0.5348, "num_tokens": 4066261671.0, "step": 973 }, { "epoch": 1.9496619083395943, "grad_norm": 0.12433131497859692, "learning_rate": 2.909286344861541e-05, "loss": 0.5304, "num_tokens": 4070455975.0, "step": 974 }, { "epoch": 1.9516654144753318, "grad_norm": 0.17970226983820092, "learning_rate": 2.9071380904417914e-05, "loss": 0.5315, "num_tokens": 4074650279.0, "step": 975 }, { "epoch": 1.9536689206110693, "grad_norm": 0.14847902123109455, "learning_rate": 2.904988644304907e-05, "loss": 0.5391, "num_tokens": 4078819058.0, "step": 976 }, { "epoch": 1.9556724267468069, "grad_norm": 0.14027626505381946, "learning_rate": 2.9028380100732794e-05, "loss": 0.5444, "num_tokens": 4083012557.0, "step": 977 }, { "epoch": 1.9576759328825446, "grad_norm": 0.14883293592940341, "learning_rate": 2.9006861913713053e-05, "loss": 0.5353, "num_tokens": 4087206861.0, "step": 978 }, { "epoch": 1.9596794390182821, "grad_norm": 0.12946662785944124, "learning_rate": 2.898533191825374e-05, "loss": 0.5415, "num_tokens": 4091401165.0, "step": 979 }, { "epoch": 1.9616829451540196, "grad_norm": 0.1320250589970752, "learning_rate": 2.896379015063868e-05, "loss": 0.5439, "num_tokens": 4095566604.0, "step": 980 }, { "epoch": 1.9636864512897572, "grad_norm": 0.14579867754621662, "learning_rate": 2.8942236647171502e-05, "loss": 0.5426, "num_tokens": 4099760908.0, "step": 981 }, { "epoch": 1.9656899574254947, "grad_norm": 0.11896708177961458, "learning_rate": 2.8920671444175658e-05, "loss": 0.5284, "num_tokens": 4103955212.0, "step": 982 }, { "epoch": 1.9676934635612322, "grad_norm": 0.20352098347887299, "learning_rate": 2.8899094577994273e-05, "loss": 0.5219, "num_tokens": 4108127934.0, "step": 983 }, { "epoch": 1.9696969696969697, "grad_norm": 0.15046895254014267, "learning_rate": 2.8877506084990163e-05, "loss": 0.5542, "num_tokens": 4112301192.0, "step": 984 }, { "epoch": 1.9717004758327072, "grad_norm": 0.15297055293989714, "learning_rate": 2.8855906001545712e-05, "loss": 0.5474, "num_tokens": 4116495496.0, "step": 985 }, { "epoch": 1.9737039819684448, "grad_norm": 0.17184321052727286, "learning_rate": 2.8834294364062855e-05, "loss": 0.547, "num_tokens": 4120689800.0, "step": 986 }, { "epoch": 1.9757074881041823, "grad_norm": 0.1535695062915909, "learning_rate": 2.8812671208962984e-05, "loss": 0.5537, "num_tokens": 4124884104.0, "step": 987 }, { "epoch": 1.9777109942399198, "grad_norm": 0.15380164453369521, "learning_rate": 2.8791036572686912e-05, "loss": 0.53, "num_tokens": 4129078408.0, "step": 988 }, { "epoch": 1.9797145003756573, "grad_norm": 0.17617842012773682, "learning_rate": 2.87693904916948e-05, "loss": 0.5385, "num_tokens": 4133272712.0, "step": 989 }, { "epoch": 1.9817180065113948, "grad_norm": 0.1651723773894058, "learning_rate": 2.874773300246609e-05, "loss": 0.5326, "num_tokens": 4137467016.0, "step": 990 }, { "epoch": 1.9837215126471324, "grad_norm": 0.1132558618540398, "learning_rate": 2.872606414149945e-05, "loss": 0.5276, "num_tokens": 4141656315.0, "step": 991 }, { "epoch": 1.9857250187828699, "grad_norm": 0.16242124371331118, "learning_rate": 2.8704383945312723e-05, "loss": 0.542, "num_tokens": 4145817985.0, "step": 992 }, { "epoch": 1.9877285249186074, "grad_norm": 0.14895342852766952, "learning_rate": 2.868269245044285e-05, "loss": 0.5412, "num_tokens": 4150012289.0, "step": 993 }, { "epoch": 1.9897320310543452, "grad_norm": 0.16431337928047318, "learning_rate": 2.8660989693445808e-05, "loss": 0.5352, "num_tokens": 4154184049.0, "step": 994 }, { "epoch": 1.9917355371900827, "grad_norm": 0.16417661005086548, "learning_rate": 2.863927571089655e-05, "loss": 0.5267, "num_tokens": 4158378353.0, "step": 995 }, { "epoch": 1.9937390433258202, "grad_norm": 0.13590472510127624, "learning_rate": 2.8617550539388963e-05, "loss": 0.5476, "num_tokens": 4162550126.0, "step": 996 }, { "epoch": 1.9957425494615577, "grad_norm": 0.1270353151484948, "learning_rate": 2.859581421553578e-05, "loss": 0.5301, "num_tokens": 4166734638.0, "step": 997 }, { "epoch": 1.9977460555972952, "grad_norm": 0.12826860336289422, "learning_rate": 2.857406677596853e-05, "loss": 0.5401, "num_tokens": 4170928942.0, "step": 998 }, { "epoch": 1.9997495617330328, "grad_norm": 0.13839943854259898, "learning_rate": 2.8552308257337472e-05, "loss": 0.5326, "num_tokens": 4175123246.0, "step": 999 }, { "epoch": 2.0, "grad_norm": 0.13839943854259898, "learning_rate": 2.8530538696311537e-05, "loss": 0.5275, "num_tokens": 4175647534.0, "step": 1000 }, { "epoch": 2.0020035061357375, "grad_norm": 0.2704148288608397, "learning_rate": 2.850875812957828e-05, "loss": 0.5074, "num_tokens": 4179841838.0, "step": 1001 }, { "epoch": 2.004007012271475, "grad_norm": 0.18092074208237363, "learning_rate": 2.8486966593843778e-05, "loss": 0.5044, "num_tokens": 4184010043.0, "step": 1002 }, { "epoch": 2.0060105184072126, "grad_norm": 0.16150002794307552, "learning_rate": 2.8465164125832617e-05, "loss": 0.5067, "num_tokens": 4188198877.0, "step": 1003 }, { "epoch": 2.00801402454295, "grad_norm": 0.1565456100070018, "learning_rate": 2.8443350762287788e-05, "loss": 0.5073, "num_tokens": 4192393181.0, "step": 1004 }, { "epoch": 2.0100175306786876, "grad_norm": 0.19057525711019885, "learning_rate": 2.842152653997066e-05, "loss": 0.5154, "num_tokens": 4196580908.0, "step": 1005 }, { "epoch": 2.012021036814425, "grad_norm": 0.18243096409498633, "learning_rate": 2.839969149566089e-05, "loss": 0.5068, "num_tokens": 4200775212.0, "step": 1006 }, { "epoch": 2.0140245429501626, "grad_norm": 0.1491972189095794, "learning_rate": 2.8377845666156377e-05, "loss": 0.4965, "num_tokens": 4204969516.0, "step": 1007 }, { "epoch": 2.0160280490859, "grad_norm": 0.19343472968728828, "learning_rate": 2.83559890882732e-05, "loss": 0.518, "num_tokens": 4209163820.0, "step": 1008 }, { "epoch": 2.0180315552216377, "grad_norm": 0.15245128878609968, "learning_rate": 2.8334121798845546e-05, "loss": 0.509, "num_tokens": 4213358124.0, "step": 1009 }, { "epoch": 2.020035061357375, "grad_norm": 0.1575269192235421, "learning_rate": 2.831224383472566e-05, "loss": 0.5096, "num_tokens": 4217552428.0, "step": 1010 }, { "epoch": 2.022038567493113, "grad_norm": 0.18494759236096875, "learning_rate": 2.8290355232783776e-05, "loss": 0.5084, "num_tokens": 4221744947.0, "step": 1011 }, { "epoch": 2.0240420736288507, "grad_norm": 0.14740236440128146, "learning_rate": 2.8268456029908037e-05, "loss": 0.5191, "num_tokens": 4225925206.0, "step": 1012 }, { "epoch": 2.026045579764588, "grad_norm": 0.1613406525000757, "learning_rate": 2.824654626300448e-05, "loss": 0.5182, "num_tokens": 4230092609.0, "step": 1013 }, { "epoch": 2.0280490859003257, "grad_norm": 0.14986825444297228, "learning_rate": 2.822462596899693e-05, "loss": 0.4942, "num_tokens": 4234272962.0, "step": 1014 }, { "epoch": 2.0300525920360633, "grad_norm": 0.15537594509764743, "learning_rate": 2.8202695184826953e-05, "loss": 0.5024, "num_tokens": 4238428418.0, "step": 1015 }, { "epoch": 2.0320560981718008, "grad_norm": 0.14888078676282712, "learning_rate": 2.8180753947453797e-05, "loss": 0.5154, "num_tokens": 4242592047.0, "step": 1016 }, { "epoch": 2.0340596043075383, "grad_norm": 0.2085967896059447, "learning_rate": 2.815880229385432e-05, "loss": 0.5153, "num_tokens": 4246748116.0, "step": 1017 }, { "epoch": 2.036063110443276, "grad_norm": 0.14667548813545359, "learning_rate": 2.8136840261022955e-05, "loss": 0.5129, "num_tokens": 4250942420.0, "step": 1018 }, { "epoch": 2.0380666165790133, "grad_norm": 0.17019946496880503, "learning_rate": 2.8114867885971594e-05, "loss": 0.514, "num_tokens": 4255136724.0, "step": 1019 }, { "epoch": 2.040070122714751, "grad_norm": 0.19713669890312419, "learning_rate": 2.8092885205729587e-05, "loss": 0.5037, "num_tokens": 4259331028.0, "step": 1020 }, { "epoch": 2.0420736288504884, "grad_norm": 0.12164360334189708, "learning_rate": 2.8070892257343633e-05, "loss": 0.4931, "num_tokens": 4263495296.0, "step": 1021 }, { "epoch": 2.044077134986226, "grad_norm": 0.18682937406555414, "learning_rate": 2.8048889077877742e-05, "loss": 0.4968, "num_tokens": 4267677046.0, "step": 1022 }, { "epoch": 2.0460806411219634, "grad_norm": 0.150134456236596, "learning_rate": 2.8026875704413178e-05, "loss": 0.5091, "num_tokens": 4271854793.0, "step": 1023 }, { "epoch": 2.048084147257701, "grad_norm": 0.12975627443307391, "learning_rate": 2.800485217404836e-05, "loss": 0.4976, "num_tokens": 4276049097.0, "step": 1024 }, { "epoch": 2.0500876533934385, "grad_norm": 0.16853490930017914, "learning_rate": 2.798281852389884e-05, "loss": 0.5015, "num_tokens": 4280243401.0, "step": 1025 }, { "epoch": 2.052091159529176, "grad_norm": 0.15167553327907596, "learning_rate": 2.796077479109722e-05, "loss": 0.5053, "num_tokens": 4284414905.0, "step": 1026 }, { "epoch": 2.0540946656649135, "grad_norm": 0.12937521241068575, "learning_rate": 2.79387210127931e-05, "loss": 0.508, "num_tokens": 4288609209.0, "step": 1027 }, { "epoch": 2.056098171800651, "grad_norm": 0.15059920680180525, "learning_rate": 2.791665722615301e-05, "loss": 0.5229, "num_tokens": 4292757992.0, "step": 1028 }, { "epoch": 2.0581016779363885, "grad_norm": 0.15337305504358612, "learning_rate": 2.7894583468360325e-05, "loss": 0.517, "num_tokens": 4296952296.0, "step": 1029 }, { "epoch": 2.060105184072126, "grad_norm": 0.13080129053291487, "learning_rate": 2.7872499776615258e-05, "loss": 0.5235, "num_tokens": 4301109305.0, "step": 1030 }, { "epoch": 2.0621086902078636, "grad_norm": 0.12373766264443205, "learning_rate": 2.7850406188134732e-05, "loss": 0.5178, "num_tokens": 4305279263.0, "step": 1031 }, { "epoch": 2.064112196343601, "grad_norm": 0.13762420085192456, "learning_rate": 2.7828302740152376e-05, "loss": 0.5052, "num_tokens": 4309451023.0, "step": 1032 }, { "epoch": 2.0661157024793386, "grad_norm": 0.11717412710125728, "learning_rate": 2.7806189469918417e-05, "loss": 0.5142, "num_tokens": 4313634105.0, "step": 1033 }, { "epoch": 2.0681192086150766, "grad_norm": 0.11999035780607217, "learning_rate": 2.778406641469963e-05, "loss": 0.5188, "num_tokens": 4317768661.0, "step": 1034 }, { "epoch": 2.070122714750814, "grad_norm": 0.11941571026178284, "learning_rate": 2.77619336117793e-05, "loss": 0.503, "num_tokens": 4321962965.0, "step": 1035 }, { "epoch": 2.0721262208865516, "grad_norm": 0.13698358734221994, "learning_rate": 2.7739791098457138e-05, "loss": 0.4994, "num_tokens": 4326157269.0, "step": 1036 }, { "epoch": 2.074129727022289, "grad_norm": 0.12377475889834527, "learning_rate": 2.7717638912049195e-05, "loss": 0.5092, "num_tokens": 4330326025.0, "step": 1037 }, { "epoch": 2.0761332331580267, "grad_norm": 0.14319710421587703, "learning_rate": 2.769547708988785e-05, "loss": 0.4986, "num_tokens": 4334516103.0, "step": 1038 }, { "epoch": 2.078136739293764, "grad_norm": 0.1101038166331404, "learning_rate": 2.7673305669321703e-05, "loss": 0.5109, "num_tokens": 4338710407.0, "step": 1039 }, { "epoch": 2.0801402454295017, "grad_norm": 0.1487650627546696, "learning_rate": 2.7651124687715536e-05, "loss": 0.5109, "num_tokens": 4342896450.0, "step": 1040 }, { "epoch": 2.0821437515652392, "grad_norm": 0.12399504601782707, "learning_rate": 2.762893418245026e-05, "loss": 0.5077, "num_tokens": 4347090754.0, "step": 1041 }, { "epoch": 2.0841472577009768, "grad_norm": 0.12737930495371613, "learning_rate": 2.7606734190922816e-05, "loss": 0.5104, "num_tokens": 4351285058.0, "step": 1042 }, { "epoch": 2.0861507638367143, "grad_norm": 0.13514746551062862, "learning_rate": 2.7584524750546125e-05, "loss": 0.5188, "num_tokens": 4355450297.0, "step": 1043 }, { "epoch": 2.088154269972452, "grad_norm": 0.13106393645386175, "learning_rate": 2.7562305898749054e-05, "loss": 0.5044, "num_tokens": 4359644601.0, "step": 1044 }, { "epoch": 2.0901577761081893, "grad_norm": 0.12215753217185175, "learning_rate": 2.7540077672976328e-05, "loss": 0.5164, "num_tokens": 4363832816.0, "step": 1045 }, { "epoch": 2.092161282243927, "grad_norm": 0.13105721755329763, "learning_rate": 2.7517840110688455e-05, "loss": 0.5038, "num_tokens": 4368027120.0, "step": 1046 }, { "epoch": 2.0941647883796644, "grad_norm": 0.11958566192670723, "learning_rate": 2.7495593249361677e-05, "loss": 0.5001, "num_tokens": 4372221424.0, "step": 1047 }, { "epoch": 2.096168294515402, "grad_norm": 0.13389986107306656, "learning_rate": 2.7473337126487933e-05, "loss": 0.5039, "num_tokens": 4376369545.0, "step": 1048 }, { "epoch": 2.0981718006511394, "grad_norm": 0.11378279098672774, "learning_rate": 2.7451071779574738e-05, "loss": 0.514, "num_tokens": 4380535654.0, "step": 1049 }, { "epoch": 2.100175306786877, "grad_norm": 0.1361634670272564, "learning_rate": 2.742879724614518e-05, "loss": 0.5162, "num_tokens": 4384729958.0, "step": 1050 }, { "epoch": 2.1021788129226144, "grad_norm": 0.11129492390751966, "learning_rate": 2.74065135637378e-05, "loss": 0.5018, "num_tokens": 4388924262.0, "step": 1051 }, { "epoch": 2.104182319058352, "grad_norm": 0.14112817091738025, "learning_rate": 2.7384220769906585e-05, "loss": 0.5036, "num_tokens": 4393117922.0, "step": 1052 }, { "epoch": 2.1061858251940895, "grad_norm": 0.15003334032461682, "learning_rate": 2.7361918902220863e-05, "loss": 0.5159, "num_tokens": 4397312226.0, "step": 1053 }, { "epoch": 2.108189331329827, "grad_norm": 0.14006707284449546, "learning_rate": 2.733960799826525e-05, "loss": 0.525, "num_tokens": 4401496723.0, "step": 1054 }, { "epoch": 2.110192837465565, "grad_norm": 0.15171649938733245, "learning_rate": 2.7317288095639605e-05, "loss": 0.5023, "num_tokens": 4405691027.0, "step": 1055 }, { "epoch": 2.1121963436013025, "grad_norm": 0.13896847862966186, "learning_rate": 2.7294959231958936e-05, "loss": 0.5011, "num_tokens": 4409835613.0, "step": 1056 }, { "epoch": 2.11419984973704, "grad_norm": 0.12629463301938515, "learning_rate": 2.727262144485337e-05, "loss": 0.498, "num_tokens": 4414001789.0, "step": 1057 }, { "epoch": 2.1162033558727775, "grad_norm": 0.13437172816006113, "learning_rate": 2.725027477196805e-05, "loss": 0.5103, "num_tokens": 4418190343.0, "step": 1058 }, { "epoch": 2.118206862008515, "grad_norm": 0.1391842014374285, "learning_rate": 2.722791925096312e-05, "loss": 0.5082, "num_tokens": 4422384647.0, "step": 1059 }, { "epoch": 2.1202103681442526, "grad_norm": 0.16110842900610578, "learning_rate": 2.7205554919513614e-05, "loss": 0.5245, "num_tokens": 4426573327.0, "step": 1060 }, { "epoch": 2.12221387427999, "grad_norm": 0.13075292371340733, "learning_rate": 2.7183181815309424e-05, "loss": 0.5124, "num_tokens": 4430745595.0, "step": 1061 }, { "epoch": 2.1242173804157276, "grad_norm": 0.1522685039043486, "learning_rate": 2.7160799976055225e-05, "loss": 0.5136, "num_tokens": 4434939899.0, "step": 1062 }, { "epoch": 2.126220886551465, "grad_norm": 0.12306482083673362, "learning_rate": 2.713840943947042e-05, "loss": 0.5117, "num_tokens": 4439123572.0, "step": 1063 }, { "epoch": 2.1282243926872026, "grad_norm": 0.1346851729656409, "learning_rate": 2.7116010243289045e-05, "loss": 0.5154, "num_tokens": 4443317876.0, "step": 1064 }, { "epoch": 2.13022789882294, "grad_norm": 0.13295966446148672, "learning_rate": 2.709360242525976e-05, "loss": 0.5031, "num_tokens": 4447454219.0, "step": 1065 }, { "epoch": 2.1322314049586777, "grad_norm": 0.12025535016711084, "learning_rate": 2.707118602314574e-05, "loss": 0.5339, "num_tokens": 4451629680.0, "step": 1066 }, { "epoch": 2.134234911094415, "grad_norm": 0.13224344637329627, "learning_rate": 2.7048761074724624e-05, "loss": 0.5008, "num_tokens": 4455823984.0, "step": 1067 }, { "epoch": 2.1362384172301527, "grad_norm": 0.11252297459074583, "learning_rate": 2.7026327617788453e-05, "loss": 0.5187, "num_tokens": 4460008365.0, "step": 1068 }, { "epoch": 2.1382419233658903, "grad_norm": 0.1684583066948467, "learning_rate": 2.700388569014363e-05, "loss": 0.5066, "num_tokens": 4464183998.0, "step": 1069 }, { "epoch": 2.1402454295016278, "grad_norm": 0.12210461838722121, "learning_rate": 2.6981435329610804e-05, "loss": 0.5207, "num_tokens": 4468378302.0, "step": 1070 }, { "epoch": 2.1422489356373653, "grad_norm": 0.14414457447495443, "learning_rate": 2.695897657402484e-05, "loss": 0.5124, "num_tokens": 4472531805.0, "step": 1071 }, { "epoch": 2.144252441773103, "grad_norm": 0.11669649760185893, "learning_rate": 2.693650946123478e-05, "loss": 0.5175, "num_tokens": 4476715812.0, "step": 1072 }, { "epoch": 2.1462559479088403, "grad_norm": 0.12047580576905578, "learning_rate": 2.691403402910371e-05, "loss": 0.5203, "num_tokens": 4480888565.0, "step": 1073 }, { "epoch": 2.148259454044578, "grad_norm": 0.10632176246488181, "learning_rate": 2.6891550315508753e-05, "loss": 0.4995, "num_tokens": 4485077750.0, "step": 1074 }, { "epoch": 2.1502629601803154, "grad_norm": 0.14710257212197791, "learning_rate": 2.6869058358340997e-05, "loss": 0.5121, "num_tokens": 4489259577.0, "step": 1075 }, { "epoch": 2.152266466316053, "grad_norm": 0.11455166760231306, "learning_rate": 2.6846558195505426e-05, "loss": 0.506, "num_tokens": 4493453881.0, "step": 1076 }, { "epoch": 2.1542699724517904, "grad_norm": 0.11245785220090929, "learning_rate": 2.6824049864920826e-05, "loss": 0.5206, "num_tokens": 4497648185.0, "step": 1077 }, { "epoch": 2.1562734785875284, "grad_norm": 0.11112712507384144, "learning_rate": 2.680153340451977e-05, "loss": 0.5101, "num_tokens": 4501842489.0, "step": 1078 }, { "epoch": 2.158276984723266, "grad_norm": 0.1153197031327148, "learning_rate": 2.6779008852248526e-05, "loss": 0.5139, "num_tokens": 4506036793.0, "step": 1079 }, { "epoch": 2.1602804908590034, "grad_norm": 0.11939266458117807, "learning_rate": 2.6756476246066996e-05, "loss": 0.4979, "num_tokens": 4510231097.0, "step": 1080 }, { "epoch": 2.162283996994741, "grad_norm": 0.12287942180738022, "learning_rate": 2.6733935623948668e-05, "loss": 0.5121, "num_tokens": 4514414619.0, "step": 1081 }, { "epoch": 2.1642875031304785, "grad_norm": 0.12098914389438378, "learning_rate": 2.671138702388052e-05, "loss": 0.5072, "num_tokens": 4518588847.0, "step": 1082 }, { "epoch": 2.166291009266216, "grad_norm": 0.1334728937990205, "learning_rate": 2.668883048386299e-05, "loss": 0.5125, "num_tokens": 4522780580.0, "step": 1083 }, { "epoch": 2.1682945154019535, "grad_norm": 0.12140939708623157, "learning_rate": 2.6666266041909885e-05, "loss": 0.5065, "num_tokens": 4526921091.0, "step": 1084 }, { "epoch": 2.170298021537691, "grad_norm": 0.13349762907128124, "learning_rate": 2.6643693736048336e-05, "loss": 0.5007, "num_tokens": 4531115395.0, "step": 1085 }, { "epoch": 2.1723015276734285, "grad_norm": 0.1156832520757068, "learning_rate": 2.6621113604318737e-05, "loss": 0.5035, "num_tokens": 4535297941.0, "step": 1086 }, { "epoch": 2.174305033809166, "grad_norm": 0.13203091420549312, "learning_rate": 2.6598525684774653e-05, "loss": 0.5116, "num_tokens": 4539472852.0, "step": 1087 }, { "epoch": 2.1763085399449036, "grad_norm": 0.13331273569036461, "learning_rate": 2.6575930015482772e-05, "loss": 0.5079, "num_tokens": 4543667156.0, "step": 1088 }, { "epoch": 2.178312046080641, "grad_norm": 0.17763362248906503, "learning_rate": 2.655332663452285e-05, "loss": 0.5151, "num_tokens": 4547861460.0, "step": 1089 }, { "epoch": 2.1803155522163786, "grad_norm": 0.13294619227249, "learning_rate": 2.6530715579987655e-05, "loss": 0.509, "num_tokens": 4552055764.0, "step": 1090 }, { "epoch": 2.182319058352116, "grad_norm": 0.14692877841531785, "learning_rate": 2.6508096889982864e-05, "loss": 0.5108, "num_tokens": 4556250068.0, "step": 1091 }, { "epoch": 2.1843225644878537, "grad_norm": 0.15448195572237403, "learning_rate": 2.6485470602627025e-05, "loss": 0.5138, "num_tokens": 4560432370.0, "step": 1092 }, { "epoch": 2.186326070623591, "grad_norm": 0.13335656673046695, "learning_rate": 2.6462836756051494e-05, "loss": 0.5118, "num_tokens": 4564613663.0, "step": 1093 }, { "epoch": 2.1883295767593287, "grad_norm": 0.11433441457733289, "learning_rate": 2.6440195388400372e-05, "loss": 0.496, "num_tokens": 4568807967.0, "step": 1094 }, { "epoch": 2.1903330828950662, "grad_norm": 0.10879039620540497, "learning_rate": 2.6417546537830436e-05, "loss": 0.5074, "num_tokens": 4572947504.0, "step": 1095 }, { "epoch": 2.1923365890308038, "grad_norm": 0.14833227504956148, "learning_rate": 2.6394890242511045e-05, "loss": 0.4984, "num_tokens": 4577120369.0, "step": 1096 }, { "epoch": 2.1943400951665413, "grad_norm": 0.1332832143738668, "learning_rate": 2.6372226540624148e-05, "loss": 0.5237, "num_tokens": 4581289961.0, "step": 1097 }, { "epoch": 2.196343601302279, "grad_norm": 0.13388707465672167, "learning_rate": 2.6349555470364144e-05, "loss": 0.5099, "num_tokens": 4585484265.0, "step": 1098 }, { "epoch": 2.1983471074380168, "grad_norm": 0.13284707196455292, "learning_rate": 2.6326877069937865e-05, "loss": 0.5206, "num_tokens": 4589678569.0, "step": 1099 }, { "epoch": 2.2003506135737543, "grad_norm": 0.15880700950878696, "learning_rate": 2.6304191377564496e-05, "loss": 0.5003, "num_tokens": 4593872873.0, "step": 1100 }, { "epoch": 2.202354119709492, "grad_norm": 0.1470176271250131, "learning_rate": 2.628149843147549e-05, "loss": 0.5142, "num_tokens": 4598067177.0, "step": 1101 }, { "epoch": 2.2043576258452293, "grad_norm": 0.15339895748404958, "learning_rate": 2.6258798269914555e-05, "loss": 0.509, "num_tokens": 4602238523.0, "step": 1102 }, { "epoch": 2.206361131980967, "grad_norm": 0.13986079784175381, "learning_rate": 2.623609093113754e-05, "loss": 0.5085, "num_tokens": 4606404664.0, "step": 1103 }, { "epoch": 2.2083646381167044, "grad_norm": 0.12202884749796655, "learning_rate": 2.6213376453412393e-05, "loss": 0.5151, "num_tokens": 4610586590.0, "step": 1104 }, { "epoch": 2.210368144252442, "grad_norm": 0.12818019539978556, "learning_rate": 2.6190654875019106e-05, "loss": 0.5097, "num_tokens": 4614759514.0, "step": 1105 }, { "epoch": 2.2123716503881794, "grad_norm": 0.10821494327017028, "learning_rate": 2.6167926234249606e-05, "loss": 0.5014, "num_tokens": 4618953818.0, "step": 1106 }, { "epoch": 2.214375156523917, "grad_norm": 0.1359466722876653, "learning_rate": 2.614519056940776e-05, "loss": 0.5112, "num_tokens": 4623135689.0, "step": 1107 }, { "epoch": 2.2163786626596544, "grad_norm": 0.12018115109609853, "learning_rate": 2.6122447918809246e-05, "loss": 0.5085, "num_tokens": 4627329993.0, "step": 1108 }, { "epoch": 2.218382168795392, "grad_norm": 0.13027176732005766, "learning_rate": 2.6099698320781523e-05, "loss": 0.5179, "num_tokens": 4631515409.0, "step": 1109 }, { "epoch": 2.2203856749311295, "grad_norm": 0.14840411216569088, "learning_rate": 2.6076941813663762e-05, "loss": 0.5055, "num_tokens": 4635709713.0, "step": 1110 }, { "epoch": 2.222389181066867, "grad_norm": 0.13173726671207694, "learning_rate": 2.605417843580677e-05, "loss": 0.5065, "num_tokens": 4639904017.0, "step": 1111 }, { "epoch": 2.2243926872026045, "grad_norm": 0.13658456028437851, "learning_rate": 2.6031408225572942e-05, "loss": 0.506, "num_tokens": 4644084415.0, "step": 1112 }, { "epoch": 2.226396193338342, "grad_norm": 0.12763166484359867, "learning_rate": 2.6008631221336186e-05, "loss": 0.4997, "num_tokens": 4648278719.0, "step": 1113 }, { "epoch": 2.2283996994740796, "grad_norm": 0.1188627572099341, "learning_rate": 2.5985847461481842e-05, "loss": 0.5151, "num_tokens": 4652465478.0, "step": 1114 }, { "epoch": 2.230403205609817, "grad_norm": 0.16330180996083937, "learning_rate": 2.5963056984406663e-05, "loss": 0.5009, "num_tokens": 4656659782.0, "step": 1115 }, { "epoch": 2.2324067117455546, "grad_norm": 0.12682580633575122, "learning_rate": 2.594025982851871e-05, "loss": 0.4982, "num_tokens": 4660854086.0, "step": 1116 }, { "epoch": 2.234410217881292, "grad_norm": 0.13747613700572697, "learning_rate": 2.5917456032237288e-05, "loss": 0.5112, "num_tokens": 4665039699.0, "step": 1117 }, { "epoch": 2.2364137240170296, "grad_norm": 0.10755549478786641, "learning_rate": 2.5894645633992906e-05, "loss": 0.5009, "num_tokens": 4669192737.0, "step": 1118 }, { "epoch": 2.238417230152767, "grad_norm": 0.11132109842963707, "learning_rate": 2.58718286722272e-05, "loss": 0.5118, "num_tokens": 4673363673.0, "step": 1119 }, { "epoch": 2.2404207362885047, "grad_norm": 0.12132363460682438, "learning_rate": 2.5849005185392866e-05, "loss": 0.5069, "num_tokens": 4677527731.0, "step": 1120 }, { "epoch": 2.242424242424242, "grad_norm": 0.11080547026379979, "learning_rate": 2.582617521195358e-05, "loss": 0.5196, "num_tokens": 4681717734.0, "step": 1121 }, { "epoch": 2.24442774855998, "grad_norm": 0.12065303108108917, "learning_rate": 2.580333879038398e-05, "loss": 0.5032, "num_tokens": 4685910169.0, "step": 1122 }, { "epoch": 2.2464312546957177, "grad_norm": 0.11507489476968422, "learning_rate": 2.578049595916955e-05, "loss": 0.5071, "num_tokens": 4690072969.0, "step": 1123 }, { "epoch": 2.248434760831455, "grad_norm": 0.10216833615731581, "learning_rate": 2.5757646756806572e-05, "loss": 0.5009, "num_tokens": 4694248932.0, "step": 1124 }, { "epoch": 2.2504382669671927, "grad_norm": 0.12547654464044763, "learning_rate": 2.5734791221802088e-05, "loss": 0.5193, "num_tokens": 4698390617.0, "step": 1125 }, { "epoch": 2.2524417731029303, "grad_norm": 0.12649954950697154, "learning_rate": 2.571192939267379e-05, "loss": 0.5074, "num_tokens": 4702584921.0, "step": 1126 }, { "epoch": 2.2544452792386678, "grad_norm": 0.10642399003860684, "learning_rate": 2.5689061307949983e-05, "loss": 0.5215, "num_tokens": 4706723046.0, "step": 1127 }, { "epoch": 2.2564487853744053, "grad_norm": 0.13825502267064343, "learning_rate": 2.566618700616952e-05, "loss": 0.5161, "num_tokens": 4710908811.0, "step": 1128 }, { "epoch": 2.258452291510143, "grad_norm": 0.13606293457059257, "learning_rate": 2.5643306525881722e-05, "loss": 0.528, "num_tokens": 4715103115.0, "step": 1129 }, { "epoch": 2.2604557976458803, "grad_norm": 0.11333365501593977, "learning_rate": 2.5620419905646334e-05, "loss": 0.5205, "num_tokens": 4719281869.0, "step": 1130 }, { "epoch": 2.262459303781618, "grad_norm": 0.1417617958973702, "learning_rate": 2.5597527184033446e-05, "loss": 0.5067, "num_tokens": 4723476173.0, "step": 1131 }, { "epoch": 2.2644628099173554, "grad_norm": 0.15149872917946094, "learning_rate": 2.5574628399623416e-05, "loss": 0.522, "num_tokens": 4727658963.0, "step": 1132 }, { "epoch": 2.266466316053093, "grad_norm": 0.11837116945765194, "learning_rate": 2.5551723591006842e-05, "loss": 0.5203, "num_tokens": 4731825441.0, "step": 1133 }, { "epoch": 2.2684698221888304, "grad_norm": 0.15862280206607826, "learning_rate": 2.552881279678446e-05, "loss": 0.5257, "num_tokens": 4736019745.0, "step": 1134 }, { "epoch": 2.270473328324568, "grad_norm": 0.12056009602009173, "learning_rate": 2.5505896055567094e-05, "loss": 0.5018, "num_tokens": 4740186120.0, "step": 1135 }, { "epoch": 2.2724768344603055, "grad_norm": 0.1223467132107992, "learning_rate": 2.5482973405975596e-05, "loss": 0.5079, "num_tokens": 4744364996.0, "step": 1136 }, { "epoch": 2.274480340596043, "grad_norm": 0.1392253513155448, "learning_rate": 2.5460044886640764e-05, "loss": 0.4997, "num_tokens": 4748559300.0, "step": 1137 }, { "epoch": 2.2764838467317805, "grad_norm": 0.109102181648929, "learning_rate": 2.5437110536203306e-05, "loss": 0.5107, "num_tokens": 4752753604.0, "step": 1138 }, { "epoch": 2.278487352867518, "grad_norm": 0.1415903686746479, "learning_rate": 2.541417039331374e-05, "loss": 0.5011, "num_tokens": 4756913459.0, "step": 1139 }, { "epoch": 2.2804908590032555, "grad_norm": 0.11384337475933738, "learning_rate": 2.5391224496632362e-05, "loss": 0.5049, "num_tokens": 4761076374.0, "step": 1140 }, { "epoch": 2.282494365138993, "grad_norm": 0.11893958431841452, "learning_rate": 2.536827288482914e-05, "loss": 0.5175, "num_tokens": 4765248382.0, "step": 1141 }, { "epoch": 2.2844978712747306, "grad_norm": 0.1501282827647991, "learning_rate": 2.5345315596583696e-05, "loss": 0.5254, "num_tokens": 4769442686.0, "step": 1142 }, { "epoch": 2.2865013774104685, "grad_norm": 0.11998309145579318, "learning_rate": 2.532235267058522e-05, "loss": 0.4937, "num_tokens": 4773636990.0, "step": 1143 }, { "epoch": 2.2885048835462056, "grad_norm": 0.11505765536570214, "learning_rate": 2.5299384145532383e-05, "loss": 0.5011, "num_tokens": 4777831294.0, "step": 1144 }, { "epoch": 2.2905083896819436, "grad_norm": 0.13142899040546172, "learning_rate": 2.527641006013331e-05, "loss": 0.5078, "num_tokens": 4781998206.0, "step": 1145 }, { "epoch": 2.292511895817681, "grad_norm": 0.10209925672516525, "learning_rate": 2.5253430453105486e-05, "loss": 0.5052, "num_tokens": 4786154434.0, "step": 1146 }, { "epoch": 2.2945154019534186, "grad_norm": 0.11665473357527434, "learning_rate": 2.523044536317571e-05, "loss": 0.5198, "num_tokens": 4790304652.0, "step": 1147 }, { "epoch": 2.296518908089156, "grad_norm": 0.10568359902775142, "learning_rate": 2.5207454829080012e-05, "loss": 0.5225, "num_tokens": 4794498956.0, "step": 1148 }, { "epoch": 2.2985224142248937, "grad_norm": 0.10382231512456976, "learning_rate": 2.51844588895636e-05, "loss": 0.5256, "num_tokens": 4798693260.0, "step": 1149 }, { "epoch": 2.300525920360631, "grad_norm": 0.10964993306813553, "learning_rate": 2.51614575833808e-05, "loss": 0.4963, "num_tokens": 4802854825.0, "step": 1150 }, { "epoch": 2.3025294264963687, "grad_norm": 0.12193776586599797, "learning_rate": 2.5138450949294964e-05, "loss": 0.5006, "num_tokens": 4807049129.0, "step": 1151 }, { "epoch": 2.3045329326321062, "grad_norm": 0.10757775188228107, "learning_rate": 2.511543902607845e-05, "loss": 0.5119, "num_tokens": 4811223652.0, "step": 1152 }, { "epoch": 2.3065364387678438, "grad_norm": 0.11390515042364947, "learning_rate": 2.5092421852512506e-05, "loss": 0.5046, "num_tokens": 4815406638.0, "step": 1153 }, { "epoch": 2.3085399449035813, "grad_norm": 0.11853129338662258, "learning_rate": 2.5069399467387232e-05, "loss": 0.5116, "num_tokens": 4819600942.0, "step": 1154 }, { "epoch": 2.310543451039319, "grad_norm": 0.11328720072395945, "learning_rate": 2.5046371909501517e-05, "loss": 0.5125, "num_tokens": 4823777535.0, "step": 1155 }, { "epoch": 2.3125469571750563, "grad_norm": 0.12553541795745857, "learning_rate": 2.502333921766297e-05, "loss": 0.5228, "num_tokens": 4827908023.0, "step": 1156 }, { "epoch": 2.314550463310794, "grad_norm": 0.11639565792450393, "learning_rate": 2.5000301430687842e-05, "loss": 0.5083, "num_tokens": 4832093046.0, "step": 1157 }, { "epoch": 2.3165539694465314, "grad_norm": 0.12157997587887938, "learning_rate": 2.497725858740098e-05, "loss": 0.5065, "num_tokens": 4836287350.0, "step": 1158 }, { "epoch": 2.318557475582269, "grad_norm": 0.14516909892183774, "learning_rate": 2.495421072663575e-05, "loss": 0.5176, "num_tokens": 4840481654.0, "step": 1159 }, { "epoch": 2.3205609817180064, "grad_norm": 0.10708056090909884, "learning_rate": 2.4931157887233966e-05, "loss": 0.4936, "num_tokens": 4844675958.0, "step": 1160 }, { "epoch": 2.322564487853744, "grad_norm": 0.1540968060939073, "learning_rate": 2.4908100108045842e-05, "loss": 0.5095, "num_tokens": 4848870262.0, "step": 1161 }, { "epoch": 2.3245679939894814, "grad_norm": 0.11059211149126068, "learning_rate": 2.4885037427929916e-05, "loss": 0.5179, "num_tokens": 4853064566.0, "step": 1162 }, { "epoch": 2.326571500125219, "grad_norm": 0.13711502306572, "learning_rate": 2.4861969885752984e-05, "loss": 0.5074, "num_tokens": 4857258870.0, "step": 1163 }, { "epoch": 2.3285750062609565, "grad_norm": 0.12262194325440227, "learning_rate": 2.483889752039002e-05, "loss": 0.4986, "num_tokens": 4861453174.0, "step": 1164 }, { "epoch": 2.330578512396694, "grad_norm": 0.12039922376871576, "learning_rate": 2.481582037072416e-05, "loss": 0.5016, "num_tokens": 4865647478.0, "step": 1165 }, { "epoch": 2.332582018532432, "grad_norm": 0.13434280556381176, "learning_rate": 2.4792738475646576e-05, "loss": 0.5229, "num_tokens": 4869828707.0, "step": 1166 }, { "epoch": 2.3345855246681695, "grad_norm": 0.1435230745173495, "learning_rate": 2.4769651874056443e-05, "loss": 0.5147, "num_tokens": 4874022813.0, "step": 1167 }, { "epoch": 2.336589030803907, "grad_norm": 0.13819734141015877, "learning_rate": 2.474656060486087e-05, "loss": 0.5121, "num_tokens": 4878196586.0, "step": 1168 }, { "epoch": 2.3385925369396445, "grad_norm": 0.12304812434001873, "learning_rate": 2.472346470697484e-05, "loss": 0.5153, "num_tokens": 4882353754.0, "step": 1169 }, { "epoch": 2.340596043075382, "grad_norm": 0.12781521195690468, "learning_rate": 2.4700364219321116e-05, "loss": 0.4972, "num_tokens": 4886532995.0, "step": 1170 }, { "epoch": 2.3425995492111196, "grad_norm": 0.12002933589849457, "learning_rate": 2.467725918083022e-05, "loss": 0.5025, "num_tokens": 4890727299.0, "step": 1171 }, { "epoch": 2.344603055346857, "grad_norm": 0.15393856229838773, "learning_rate": 2.4654149630440323e-05, "loss": 0.4992, "num_tokens": 4894910050.0, "step": 1172 }, { "epoch": 2.3466065614825946, "grad_norm": 0.09941388985870649, "learning_rate": 2.4631035607097215e-05, "loss": 0.496, "num_tokens": 4899089399.0, "step": 1173 }, { "epoch": 2.348610067618332, "grad_norm": 0.169968772241336, "learning_rate": 2.4607917149754217e-05, "loss": 0.5218, "num_tokens": 4903283703.0, "step": 1174 }, { "epoch": 2.3506135737540697, "grad_norm": 0.10562366676670491, "learning_rate": 2.4584794297372128e-05, "loss": 0.5067, "num_tokens": 4907460342.0, "step": 1175 }, { "epoch": 2.352617079889807, "grad_norm": 0.18031587673498425, "learning_rate": 2.456166708891914e-05, "loss": 0.5282, "num_tokens": 4911654646.0, "step": 1176 }, { "epoch": 2.3546205860255447, "grad_norm": 0.12531894542204475, "learning_rate": 2.4538535563370796e-05, "loss": 0.5071, "num_tokens": 4915840378.0, "step": 1177 }, { "epoch": 2.356624092161282, "grad_norm": 0.11547824209025036, "learning_rate": 2.4515399759709918e-05, "loss": 0.5133, "num_tokens": 4920029374.0, "step": 1178 }, { "epoch": 2.3586275982970197, "grad_norm": 0.1278175039705439, "learning_rate": 2.4492259716926536e-05, "loss": 0.5185, "num_tokens": 4924167577.0, "step": 1179 }, { "epoch": 2.3606311044327573, "grad_norm": 0.11669393635015875, "learning_rate": 2.4469115474017815e-05, "loss": 0.5171, "num_tokens": 4928361881.0, "step": 1180 }, { "epoch": 2.3626346105684948, "grad_norm": 0.12123913619582341, "learning_rate": 2.4445967069988004e-05, "loss": 0.5101, "num_tokens": 4932537801.0, "step": 1181 }, { "epoch": 2.3646381167042323, "grad_norm": 0.13051026542655342, "learning_rate": 2.4422814543848374e-05, "loss": 0.5233, "num_tokens": 4936732105.0, "step": 1182 }, { "epoch": 2.36664162283997, "grad_norm": 0.1172662866809605, "learning_rate": 2.4399657934617135e-05, "loss": 0.5087, "num_tokens": 4940912688.0, "step": 1183 }, { "epoch": 2.3686451289757073, "grad_norm": 0.12058726160212621, "learning_rate": 2.4376497281319372e-05, "loss": 0.5126, "num_tokens": 4945106992.0, "step": 1184 }, { "epoch": 2.370648635111445, "grad_norm": 0.10365711387336221, "learning_rate": 2.4353332622986983e-05, "loss": 0.5123, "num_tokens": 4949273613.0, "step": 1185 }, { "epoch": 2.3726521412471824, "grad_norm": 0.1035175567073624, "learning_rate": 2.433016399865864e-05, "loss": 0.5125, "num_tokens": 4953467917.0, "step": 1186 }, { "epoch": 2.3746556473829203, "grad_norm": 0.11437046989457895, "learning_rate": 2.4306991447379677e-05, "loss": 0.5119, "num_tokens": 4957634168.0, "step": 1187 }, { "epoch": 2.3766591535186574, "grad_norm": 0.09955059186612271, "learning_rate": 2.4283815008202037e-05, "loss": 0.5081, "num_tokens": 4961828472.0, "step": 1188 }, { "epoch": 2.3786626596543954, "grad_norm": 0.1118069080470591, "learning_rate": 2.4260634720184247e-05, "loss": 0.509, "num_tokens": 4966003813.0, "step": 1189 }, { "epoch": 2.380666165790133, "grad_norm": 0.11112030367651768, "learning_rate": 2.4237450622391298e-05, "loss": 0.5142, "num_tokens": 4970198117.0, "step": 1190 }, { "epoch": 2.3826696719258704, "grad_norm": 0.13975918417802677, "learning_rate": 2.4214262753894603e-05, "loss": 0.5081, "num_tokens": 4974366204.0, "step": 1191 }, { "epoch": 2.384673178061608, "grad_norm": 0.12310988792651997, "learning_rate": 2.4191071153771934e-05, "loss": 0.5191, "num_tokens": 4978543504.0, "step": 1192 }, { "epoch": 2.3866766841973455, "grad_norm": 0.13798247855501367, "learning_rate": 2.4167875861107356e-05, "loss": 0.5192, "num_tokens": 4982692372.0, "step": 1193 }, { "epoch": 2.388680190333083, "grad_norm": 0.11988319312894177, "learning_rate": 2.4144676914991143e-05, "loss": 0.5131, "num_tokens": 4986870076.0, "step": 1194 }, { "epoch": 2.3906836964688205, "grad_norm": 0.14869778919466292, "learning_rate": 2.4121474354519733e-05, "loss": 0.4979, "num_tokens": 4991064380.0, "step": 1195 }, { "epoch": 2.392687202604558, "grad_norm": 0.11428281159417808, "learning_rate": 2.4098268218795666e-05, "loss": 0.5014, "num_tokens": 4995252385.0, "step": 1196 }, { "epoch": 2.3946907087402955, "grad_norm": 0.13477719226481633, "learning_rate": 2.4075058546927498e-05, "loss": 0.5131, "num_tokens": 4999446689.0, "step": 1197 }, { "epoch": 2.396694214876033, "grad_norm": 0.10578782437010963, "learning_rate": 2.4051845378029732e-05, "loss": 0.5011, "num_tokens": 5003640685.0, "step": 1198 }, { "epoch": 2.3986977210117706, "grad_norm": 0.15648710614110412, "learning_rate": 2.402862875122279e-05, "loss": 0.5176, "num_tokens": 5007834989.0, "step": 1199 }, { "epoch": 2.400701227147508, "grad_norm": 0.12154262588700547, "learning_rate": 2.40054087056329e-05, "loss": 0.5064, "num_tokens": 5012029293.0, "step": 1200 }, { "epoch": 2.4027047332832456, "grad_norm": 0.14955130944758033, "learning_rate": 2.3982185280392067e-05, "loss": 0.5045, "num_tokens": 5016209531.0, "step": 1201 }, { "epoch": 2.404708239418983, "grad_norm": 0.11089422416300043, "learning_rate": 2.3958958514637978e-05, "loss": 0.5103, "num_tokens": 5020376964.0, "step": 1202 }, { "epoch": 2.4067117455547207, "grad_norm": 0.12363247765735833, "learning_rate": 2.393572844751396e-05, "loss": 0.5149, "num_tokens": 5024571268.0, "step": 1203 }, { "epoch": 2.408715251690458, "grad_norm": 0.1060635308008993, "learning_rate": 2.3912495118168894e-05, "loss": 0.5137, "num_tokens": 5028750132.0, "step": 1204 }, { "epoch": 2.4107187578261957, "grad_norm": 0.1201362737230632, "learning_rate": 2.388925856575716e-05, "loss": 0.5211, "num_tokens": 5032944436.0, "step": 1205 }, { "epoch": 2.4127222639619332, "grad_norm": 0.10910790859041365, "learning_rate": 2.3866018829438603e-05, "loss": 0.5095, "num_tokens": 5037095916.0, "step": 1206 }, { "epoch": 2.4147257700976708, "grad_norm": 0.1103629218311134, "learning_rate": 2.3842775948378372e-05, "loss": 0.5204, "num_tokens": 5041290220.0, "step": 1207 }, { "epoch": 2.4167292762334083, "grad_norm": 0.11557984873582457, "learning_rate": 2.381952996174697e-05, "loss": 0.5145, "num_tokens": 5045380582.0, "step": 1208 }, { "epoch": 2.418732782369146, "grad_norm": 0.12659196331911995, "learning_rate": 2.379628090872009e-05, "loss": 0.5075, "num_tokens": 5049562163.0, "step": 1209 }, { "epoch": 2.4207362885048838, "grad_norm": 0.11515613966114163, "learning_rate": 2.377302882847864e-05, "loss": 0.5105, "num_tokens": 5053733782.0, "step": 1210 }, { "epoch": 2.4227397946406213, "grad_norm": 0.14138806378994698, "learning_rate": 2.374977376020859e-05, "loss": 0.5243, "num_tokens": 5057928086.0, "step": 1211 }, { "epoch": 2.424743300776359, "grad_norm": 0.1442176673343124, "learning_rate": 2.3726515743100963e-05, "loss": 0.5198, "num_tokens": 5062122390.0, "step": 1212 }, { "epoch": 2.4267468069120963, "grad_norm": 0.11174001133688899, "learning_rate": 2.3703254816351752e-05, "loss": 0.5166, "num_tokens": 5066261249.0, "step": 1213 }, { "epoch": 2.428750313047834, "grad_norm": 0.17444204650869352, "learning_rate": 2.3679991019161846e-05, "loss": 0.5057, "num_tokens": 5070430305.0, "step": 1214 }, { "epoch": 2.4307538191835714, "grad_norm": 0.130180105722414, "learning_rate": 2.3656724390736985e-05, "loss": 0.4983, "num_tokens": 5074609251.0, "step": 1215 }, { "epoch": 2.432757325319309, "grad_norm": 0.1905668808302003, "learning_rate": 2.363345497028766e-05, "loss": 0.5022, "num_tokens": 5078803555.0, "step": 1216 }, { "epoch": 2.4347608314550464, "grad_norm": 0.14672304632655503, "learning_rate": 2.3610182797029086e-05, "loss": 0.5003, "num_tokens": 5082994193.0, "step": 1217 }, { "epoch": 2.436764337590784, "grad_norm": 0.15936692457233448, "learning_rate": 2.3586907910181105e-05, "loss": 0.5118, "num_tokens": 5087188497.0, "step": 1218 }, { "epoch": 2.4387678437265214, "grad_norm": 0.11968171814733611, "learning_rate": 2.356363034896814e-05, "loss": 0.5158, "num_tokens": 5091382801.0, "step": 1219 }, { "epoch": 2.440771349862259, "grad_norm": 0.172100151176835, "learning_rate": 2.354035015261912e-05, "loss": 0.5181, "num_tokens": 5095577105.0, "step": 1220 }, { "epoch": 2.4427748559979965, "grad_norm": 0.14026789839970186, "learning_rate": 2.3517067360367408e-05, "loss": 0.5195, "num_tokens": 5099771409.0, "step": 1221 }, { "epoch": 2.444778362133734, "grad_norm": 0.12289331959229967, "learning_rate": 2.3493782011450753e-05, "loss": 0.5177, "num_tokens": 5103965713.0, "step": 1222 }, { "epoch": 2.4467818682694715, "grad_norm": 0.12488734979014976, "learning_rate": 2.34704941451112e-05, "loss": 0.5008, "num_tokens": 5108147998.0, "step": 1223 }, { "epoch": 2.448785374405209, "grad_norm": 0.12597554870707448, "learning_rate": 2.3447203800595043e-05, "loss": 0.5082, "num_tokens": 5112262102.0, "step": 1224 }, { "epoch": 2.4507888805409466, "grad_norm": 0.11096503477640171, "learning_rate": 2.342391101715276e-05, "loss": 0.5127, "num_tokens": 5116456406.0, "step": 1225 }, { "epoch": 2.452792386676684, "grad_norm": 0.1180029239551979, "learning_rate": 2.3400615834038915e-05, "loss": 0.5124, "num_tokens": 5120582407.0, "step": 1226 }, { "epoch": 2.4547958928124216, "grad_norm": 0.11806375568217979, "learning_rate": 2.3377318290512155e-05, "loss": 0.5031, "num_tokens": 5124772872.0, "step": 1227 }, { "epoch": 2.456799398948159, "grad_norm": 0.10599994108722656, "learning_rate": 2.3354018425835073e-05, "loss": 0.5131, "num_tokens": 5128967176.0, "step": 1228 }, { "epoch": 2.4588029050838967, "grad_norm": 0.11294338290611046, "learning_rate": 2.3330716279274187e-05, "loss": 0.5042, "num_tokens": 5133161480.0, "step": 1229 }, { "epoch": 2.460806411219634, "grad_norm": 0.12070025241796316, "learning_rate": 2.330741189009984e-05, "loss": 0.5256, "num_tokens": 5137355784.0, "step": 1230 }, { "epoch": 2.462809917355372, "grad_norm": 0.11061125848003081, "learning_rate": 2.3284105297586185e-05, "loss": 0.5064, "num_tokens": 5141550088.0, "step": 1231 }, { "epoch": 2.464813423491109, "grad_norm": 0.13194031752222932, "learning_rate": 2.326079654101108e-05, "loss": 0.5095, "num_tokens": 5145744392.0, "step": 1232 }, { "epoch": 2.466816929626847, "grad_norm": 0.11661623729920821, "learning_rate": 2.3237485659656013e-05, "loss": 0.5185, "num_tokens": 5149914209.0, "step": 1233 }, { "epoch": 2.4688204357625847, "grad_norm": 0.14129484685169613, "learning_rate": 2.3214172692806068e-05, "loss": 0.5107, "num_tokens": 5154050447.0, "step": 1234 }, { "epoch": 2.470823941898322, "grad_norm": 0.12147222626435472, "learning_rate": 2.3190857679749836e-05, "loss": 0.5146, "num_tokens": 5158244751.0, "step": 1235 }, { "epoch": 2.4728274480340597, "grad_norm": 0.13534143454436817, "learning_rate": 2.3167540659779374e-05, "loss": 0.5017, "num_tokens": 5162439055.0, "step": 1236 }, { "epoch": 2.4748309541697973, "grad_norm": 0.12750245762255694, "learning_rate": 2.3144221672190102e-05, "loss": 0.5025, "num_tokens": 5166633359.0, "step": 1237 }, { "epoch": 2.476834460305535, "grad_norm": 0.11393506229764633, "learning_rate": 2.3120900756280754e-05, "loss": 0.508, "num_tokens": 5170827663.0, "step": 1238 }, { "epoch": 2.4788379664412723, "grad_norm": 0.119425329294816, "learning_rate": 2.3097577951353326e-05, "loss": 0.5133, "num_tokens": 5175021967.0, "step": 1239 }, { "epoch": 2.48084147257701, "grad_norm": 0.11464933798108423, "learning_rate": 2.3074253296712987e-05, "loss": 0.5178, "num_tokens": 5179216271.0, "step": 1240 }, { "epoch": 2.4828449787127473, "grad_norm": 0.1118236825322537, "learning_rate": 2.3050926831668045e-05, "loss": 0.5126, "num_tokens": 5183410575.0, "step": 1241 }, { "epoch": 2.484848484848485, "grad_norm": 0.12174608964918018, "learning_rate": 2.3027598595529828e-05, "loss": 0.5221, "num_tokens": 5187604879.0, "step": 1242 }, { "epoch": 2.4868519909842224, "grad_norm": 0.10291179754922306, "learning_rate": 2.300426862761267e-05, "loss": 0.5042, "num_tokens": 5191776990.0, "step": 1243 }, { "epoch": 2.48885549711996, "grad_norm": 0.10954129227804725, "learning_rate": 2.2980936967233815e-05, "loss": 0.4939, "num_tokens": 5195954147.0, "step": 1244 }, { "epoch": 2.4908590032556974, "grad_norm": 0.10457815198278579, "learning_rate": 2.2957603653713364e-05, "loss": 0.4997, "num_tokens": 5200129214.0, "step": 1245 }, { "epoch": 2.492862509391435, "grad_norm": 0.11112775852860098, "learning_rate": 2.29342687263742e-05, "loss": 0.5206, "num_tokens": 5204316689.0, "step": 1246 }, { "epoch": 2.4948660155271725, "grad_norm": 0.10226202895972457, "learning_rate": 2.291093222454193e-05, "loss": 0.5087, "num_tokens": 5208507067.0, "step": 1247 }, { "epoch": 2.49686952166291, "grad_norm": 0.11661000099553731, "learning_rate": 2.2887594187544805e-05, "loss": 0.5212, "num_tokens": 5212691244.0, "step": 1248 }, { "epoch": 2.4988730277986475, "grad_norm": 0.10648768916435576, "learning_rate": 2.2864254654713674e-05, "loss": 0.5217, "num_tokens": 5216859453.0, "step": 1249 }, { "epoch": 2.500876533934385, "grad_norm": 0.11862728901567039, "learning_rate": 2.2840913665381913e-05, "loss": 0.5016, "num_tokens": 5221053757.0, "step": 1250 }, { "epoch": 2.5028800400701225, "grad_norm": 0.11372552439863855, "learning_rate": 2.281757125888532e-05, "loss": 0.5063, "num_tokens": 5225248061.0, "step": 1251 }, { "epoch": 2.5048835462058605, "grad_norm": 0.13500667707521258, "learning_rate": 2.2794227474562125e-05, "loss": 0.5161, "num_tokens": 5229442365.0, "step": 1252 }, { "epoch": 2.5068870523415976, "grad_norm": 0.12239199532268445, "learning_rate": 2.2770882351752847e-05, "loss": 0.5139, "num_tokens": 5233636669.0, "step": 1253 }, { "epoch": 2.5088905584773356, "grad_norm": 0.14059243636574992, "learning_rate": 2.2747535929800278e-05, "loss": 0.5099, "num_tokens": 5237830107.0, "step": 1254 }, { "epoch": 2.5108940646130726, "grad_norm": 0.1257279302728729, "learning_rate": 2.272418824804939e-05, "loss": 0.5129, "num_tokens": 5241994965.0, "step": 1255 }, { "epoch": 2.5128975707488106, "grad_norm": 0.11682219125549713, "learning_rate": 2.270083934584728e-05, "loss": 0.5066, "num_tokens": 5246189269.0, "step": 1256 }, { "epoch": 2.514901076884548, "grad_norm": 0.11722032883130534, "learning_rate": 2.2677489262543115e-05, "loss": 0.4926, "num_tokens": 5250383573.0, "step": 1257 }, { "epoch": 2.5169045830202856, "grad_norm": 0.10956101293986102, "learning_rate": 2.2654138037488026e-05, "loss": 0.5109, "num_tokens": 5254577877.0, "step": 1258 }, { "epoch": 2.518908089156023, "grad_norm": 0.1226249668433567, "learning_rate": 2.2630785710035098e-05, "loss": 0.5154, "num_tokens": 5258757602.0, "step": 1259 }, { "epoch": 2.5209115952917607, "grad_norm": 0.10669601735854534, "learning_rate": 2.2607432319539243e-05, "loss": 0.5231, "num_tokens": 5262951906.0, "step": 1260 }, { "epoch": 2.522915101427498, "grad_norm": 0.11668896752092753, "learning_rate": 2.2584077905357196e-05, "loss": 0.5043, "num_tokens": 5267142166.0, "step": 1261 }, { "epoch": 2.5249186075632357, "grad_norm": 0.11055553970816659, "learning_rate": 2.2560722506847395e-05, "loss": 0.5156, "num_tokens": 5271336470.0, "step": 1262 }, { "epoch": 2.5269221136989732, "grad_norm": 0.11109116843921518, "learning_rate": 2.253736616336994e-05, "loss": 0.5128, "num_tokens": 5275530774.0, "step": 1263 }, { "epoch": 2.5289256198347108, "grad_norm": 0.11827050431528122, "learning_rate": 2.251400891428653e-05, "loss": 0.5111, "num_tokens": 5279725078.0, "step": 1264 }, { "epoch": 2.5309291259704483, "grad_norm": 0.10942613572706256, "learning_rate": 2.24906507989604e-05, "loss": 0.5102, "num_tokens": 5283880592.0, "step": 1265 }, { "epoch": 2.532932632106186, "grad_norm": 0.12260851704430448, "learning_rate": 2.246729185675621e-05, "loss": 0.4944, "num_tokens": 5288061405.0, "step": 1266 }, { "epoch": 2.5349361382419233, "grad_norm": 0.11268153855704195, "learning_rate": 2.2443932127040053e-05, "loss": 0.5323, "num_tokens": 5292255709.0, "step": 1267 }, { "epoch": 2.536939644377661, "grad_norm": 0.1200582887756453, "learning_rate": 2.2420571649179323e-05, "loss": 0.5105, "num_tokens": 5296428390.0, "step": 1268 }, { "epoch": 2.5389431505133984, "grad_norm": 0.10903642890471676, "learning_rate": 2.239721046254269e-05, "loss": 0.518, "num_tokens": 5300599506.0, "step": 1269 }, { "epoch": 2.540946656649136, "grad_norm": 0.10109261973975374, "learning_rate": 2.237384860650001e-05, "loss": 0.5241, "num_tokens": 5304793810.0, "step": 1270 }, { "epoch": 2.5429501627848734, "grad_norm": 0.10840149138315894, "learning_rate": 2.2350486120422265e-05, "loss": 0.5227, "num_tokens": 5308959927.0, "step": 1271 }, { "epoch": 2.544953668920611, "grad_norm": 0.11395005853891697, "learning_rate": 2.232712304368151e-05, "loss": 0.5132, "num_tokens": 5313126178.0, "step": 1272 }, { "epoch": 2.5469571750563484, "grad_norm": 0.12888424680100413, "learning_rate": 2.230375941565078e-05, "loss": 0.5129, "num_tokens": 5317296310.0, "step": 1273 }, { "epoch": 2.548960681192086, "grad_norm": 0.128307413847409, "learning_rate": 2.2280395275704058e-05, "loss": 0.506, "num_tokens": 5321490614.0, "step": 1274 }, { "epoch": 2.550964187327824, "grad_norm": 0.11132566524114981, "learning_rate": 2.2257030663216175e-05, "loss": 0.4961, "num_tokens": 5325656053.0, "step": 1275 }, { "epoch": 2.552967693463561, "grad_norm": 0.15312948697159529, "learning_rate": 2.2233665617562758e-05, "loss": 0.5104, "num_tokens": 5329842515.0, "step": 1276 }, { "epoch": 2.554971199599299, "grad_norm": 0.12110532570026532, "learning_rate": 2.2210300178120176e-05, "loss": 0.5139, "num_tokens": 5334028849.0, "step": 1277 }, { "epoch": 2.556974705735036, "grad_norm": 0.12142117608231191, "learning_rate": 2.218693438426545e-05, "loss": 0.4902, "num_tokens": 5338182441.0, "step": 1278 }, { "epoch": 2.558978211870774, "grad_norm": 0.11958130326176096, "learning_rate": 2.2163568275376206e-05, "loss": 0.5224, "num_tokens": 5342348231.0, "step": 1279 }, { "epoch": 2.5609817180065115, "grad_norm": 0.11786521385094274, "learning_rate": 2.2140201890830593e-05, "loss": 0.5156, "num_tokens": 5346518339.0, "step": 1280 }, { "epoch": 2.562985224142249, "grad_norm": 0.10582001585455715, "learning_rate": 2.2116835270007236e-05, "loss": 0.5109, "num_tokens": 5350712643.0, "step": 1281 }, { "epoch": 2.5649887302779866, "grad_norm": 0.1143117659703359, "learning_rate": 2.2093468452285145e-05, "loss": 0.5186, "num_tokens": 5354906947.0, "step": 1282 }, { "epoch": 2.566992236413724, "grad_norm": 0.11117714540739305, "learning_rate": 2.207010147704367e-05, "loss": 0.5067, "num_tokens": 5359101251.0, "step": 1283 }, { "epoch": 2.5689957425494616, "grad_norm": 0.10866669407811302, "learning_rate": 2.2046734383662418e-05, "loss": 0.5169, "num_tokens": 5363264892.0, "step": 1284 }, { "epoch": 2.570999248685199, "grad_norm": 0.1194300174766482, "learning_rate": 2.2023367211521216e-05, "loss": 0.5169, "num_tokens": 5367459196.0, "step": 1285 }, { "epoch": 2.5730027548209367, "grad_norm": 0.119537856347929, "learning_rate": 2.2000000000000003e-05, "loss": 0.5143, "num_tokens": 5371642924.0, "step": 1286 }, { "epoch": 2.575006260956674, "grad_norm": 0.11217934192778964, "learning_rate": 2.1976632788478793e-05, "loss": 0.5025, "num_tokens": 5375772657.0, "step": 1287 }, { "epoch": 2.5770097670924117, "grad_norm": 0.14353785851867953, "learning_rate": 2.1953265616337588e-05, "loss": 0.5149, "num_tokens": 5379942249.0, "step": 1288 }, { "epoch": 2.579013273228149, "grad_norm": 0.12655596959888318, "learning_rate": 2.1929898522956345e-05, "loss": 0.5054, "num_tokens": 5384110028.0, "step": 1289 }, { "epoch": 2.5810167793638867, "grad_norm": 0.11000425650747436, "learning_rate": 2.1906531547714867e-05, "loss": 0.5111, "num_tokens": 5388304332.0, "step": 1290 }, { "epoch": 2.5830202854996243, "grad_norm": 0.11698801935402553, "learning_rate": 2.1883164729992773e-05, "loss": 0.5086, "num_tokens": 5392472099.0, "step": 1291 }, { "epoch": 2.585023791635362, "grad_norm": 0.12050202852890222, "learning_rate": 2.185979810916941e-05, "loss": 0.5264, "num_tokens": 5396659678.0, "step": 1292 }, { "epoch": 2.5870272977710993, "grad_norm": 0.13124158067735528, "learning_rate": 2.1836431724623803e-05, "loss": 0.5205, "num_tokens": 5400853982.0, "step": 1293 }, { "epoch": 2.589030803906837, "grad_norm": 0.11912140590218205, "learning_rate": 2.1813065615734555e-05, "loss": 0.5273, "num_tokens": 5405043152.0, "step": 1294 }, { "epoch": 2.5910343100425743, "grad_norm": 0.11570998960815912, "learning_rate": 2.178969982187983e-05, "loss": 0.5055, "num_tokens": 5409237456.0, "step": 1295 }, { "epoch": 2.5930378161783123, "grad_norm": 0.15746820210951748, "learning_rate": 2.1766334382437248e-05, "loss": 0.5182, "num_tokens": 5413431760.0, "step": 1296 }, { "epoch": 2.5950413223140494, "grad_norm": 0.11884345470017804, "learning_rate": 2.1742969336783837e-05, "loss": 0.504, "num_tokens": 5417623027.0, "step": 1297 }, { "epoch": 2.5970448284497873, "grad_norm": 0.16431843338654942, "learning_rate": 2.171960472429595e-05, "loss": 0.5242, "num_tokens": 5421817331.0, "step": 1298 }, { "epoch": 2.5990483345855244, "grad_norm": 0.13304195397827182, "learning_rate": 2.1696240584349226e-05, "loss": 0.5155, "num_tokens": 5426011635.0, "step": 1299 }, { "epoch": 2.6010518407212624, "grad_norm": 0.16388507156546006, "learning_rate": 2.1672876956318495e-05, "loss": 0.5077, "num_tokens": 5430180414.0, "step": 1300 }, { "epoch": 2.603055346857, "grad_norm": 0.12771153002644664, "learning_rate": 2.164951387957774e-05, "loss": 0.5067, "num_tokens": 5434373428.0, "step": 1301 }, { "epoch": 2.6050588529927374, "grad_norm": 0.15433768617746316, "learning_rate": 2.1626151393499993e-05, "loss": 0.5077, "num_tokens": 5438567732.0, "step": 1302 }, { "epoch": 2.607062359128475, "grad_norm": 0.12459972777520227, "learning_rate": 2.1602789537457315e-05, "loss": 0.5175, "num_tokens": 5442762036.0, "step": 1303 }, { "epoch": 2.6090658652642125, "grad_norm": 0.12844881699579944, "learning_rate": 2.157942835082068e-05, "loss": 0.5012, "num_tokens": 5446924049.0, "step": 1304 }, { "epoch": 2.61106937139995, "grad_norm": 0.1363440162760816, "learning_rate": 2.1556067872959953e-05, "loss": 0.5157, "num_tokens": 5451118353.0, "step": 1305 }, { "epoch": 2.6130728775356875, "grad_norm": 0.11367392798887287, "learning_rate": 2.1532708143243798e-05, "loss": 0.4999, "num_tokens": 5455312657.0, "step": 1306 }, { "epoch": 2.615076383671425, "grad_norm": 0.14608233525418488, "learning_rate": 2.1509349201039612e-05, "loss": 0.5237, "num_tokens": 5459506961.0, "step": 1307 }, { "epoch": 2.6170798898071626, "grad_norm": 0.12745354718975574, "learning_rate": 2.1485991085713472e-05, "loss": 0.5032, "num_tokens": 5463701265.0, "step": 1308 }, { "epoch": 2.6190833959429, "grad_norm": 0.135463724485715, "learning_rate": 2.1462633836630062e-05, "loss": 0.5068, "num_tokens": 5467895569.0, "step": 1309 }, { "epoch": 2.6210869020786376, "grad_norm": 0.12808404312179722, "learning_rate": 2.143927749315261e-05, "loss": 0.524, "num_tokens": 5472066553.0, "step": 1310 }, { "epoch": 2.623090408214375, "grad_norm": 0.116133572559842, "learning_rate": 2.141592209464281e-05, "loss": 0.5045, "num_tokens": 5476260857.0, "step": 1311 }, { "epoch": 2.6250939143501126, "grad_norm": 0.1447430086171552, "learning_rate": 2.1392567680460763e-05, "loss": 0.5093, "num_tokens": 5480437029.0, "step": 1312 }, { "epoch": 2.62709742048585, "grad_norm": 0.10426857983352149, "learning_rate": 2.1369214289964914e-05, "loss": 0.5079, "num_tokens": 5484631333.0, "step": 1313 }, { "epoch": 2.6291009266215877, "grad_norm": 0.136984331418097, "learning_rate": 2.1345861962511983e-05, "loss": 0.5107, "num_tokens": 5488801048.0, "step": 1314 }, { "epoch": 2.631104432757325, "grad_norm": 0.11700900565752209, "learning_rate": 2.1322510737456895e-05, "loss": 0.5161, "num_tokens": 5492963378.0, "step": 1315 }, { "epoch": 2.6331079388930627, "grad_norm": 0.11972503538626748, "learning_rate": 2.1299160654152725e-05, "loss": 0.4997, "num_tokens": 5497157682.0, "step": 1316 }, { "epoch": 2.6351114450288002, "grad_norm": 0.11908146097842388, "learning_rate": 2.127581175195062e-05, "loss": 0.511, "num_tokens": 5501351986.0, "step": 1317 }, { "epoch": 2.6371149511645378, "grad_norm": 0.12492140901146342, "learning_rate": 2.1252464070199728e-05, "loss": 0.5132, "num_tokens": 5505491233.0, "step": 1318 }, { "epoch": 2.6391184573002757, "grad_norm": 0.1277627389987716, "learning_rate": 2.122911764824716e-05, "loss": 0.519, "num_tokens": 5509648044.0, "step": 1319 }, { "epoch": 2.641121963436013, "grad_norm": 0.12320636243683294, "learning_rate": 2.120577252543788e-05, "loss": 0.5072, "num_tokens": 5513842348.0, "step": 1320 }, { "epoch": 2.6431254695717508, "grad_norm": 0.11004800971511251, "learning_rate": 2.1182428741114688e-05, "loss": 0.5161, "num_tokens": 5518036652.0, "step": 1321 }, { "epoch": 2.645128975707488, "grad_norm": 0.13164559222984618, "learning_rate": 2.1159086334618096e-05, "loss": 0.5116, "num_tokens": 5522216489.0, "step": 1322 }, { "epoch": 2.647132481843226, "grad_norm": 0.109883851962463, "learning_rate": 2.1135745345286328e-05, "loss": 0.5114, "num_tokens": 5526394312.0, "step": 1323 }, { "epoch": 2.6491359879789633, "grad_norm": 0.11532928036669306, "learning_rate": 2.1112405812455204e-05, "loss": 0.5306, "num_tokens": 5530588616.0, "step": 1324 }, { "epoch": 2.651139494114701, "grad_norm": 0.10938682753536692, "learning_rate": 2.108906777545808e-05, "loss": 0.5205, "num_tokens": 5534780236.0, "step": 1325 }, { "epoch": 2.6531430002504384, "grad_norm": 0.12519654478830633, "learning_rate": 2.106573127362581e-05, "loss": 0.5001, "num_tokens": 5538974540.0, "step": 1326 }, { "epoch": 2.655146506386176, "grad_norm": 0.12192519612506754, "learning_rate": 2.104239634628664e-05, "loss": 0.5059, "num_tokens": 5543123795.0, "step": 1327 }, { "epoch": 2.6571500125219134, "grad_norm": 0.111343078723692, "learning_rate": 2.1019063032766188e-05, "loss": 0.5154, "num_tokens": 5547315694.0, "step": 1328 }, { "epoch": 2.659153518657651, "grad_norm": 0.11062413364070008, "learning_rate": 2.0995731372387333e-05, "loss": 0.507, "num_tokens": 5551509998.0, "step": 1329 }, { "epoch": 2.6611570247933884, "grad_norm": 0.10126531460788028, "learning_rate": 2.0972401404470174e-05, "loss": 0.5066, "num_tokens": 5555676891.0, "step": 1330 }, { "epoch": 2.663160530929126, "grad_norm": 0.1164272882059182, "learning_rate": 2.0949073168331964e-05, "loss": 0.5105, "num_tokens": 5559830715.0, "step": 1331 }, { "epoch": 2.6651640370648635, "grad_norm": 0.09623736308115381, "learning_rate": 2.0925746703287015e-05, "loss": 0.5081, "num_tokens": 5564002049.0, "step": 1332 }, { "epoch": 2.667167543200601, "grad_norm": 0.10742214436084474, "learning_rate": 2.0902422048646683e-05, "loss": 0.5066, "num_tokens": 5568196353.0, "step": 1333 }, { "epoch": 2.6691710493363385, "grad_norm": 0.09765383931325944, "learning_rate": 2.0879099243719255e-05, "loss": 0.5017, "num_tokens": 5572374281.0, "step": 1334 }, { "epoch": 2.671174555472076, "grad_norm": 0.11061638227051818, "learning_rate": 2.0855778327809903e-05, "loss": 0.506, "num_tokens": 5576568585.0, "step": 1335 }, { "epoch": 2.6731780616078136, "grad_norm": 0.10605443557725241, "learning_rate": 2.0832459340220628e-05, "loss": 0.5025, "num_tokens": 5580762889.0, "step": 1336 }, { "epoch": 2.675181567743551, "grad_norm": 0.11994940436983251, "learning_rate": 2.0809142320250162e-05, "loss": 0.5111, "num_tokens": 5584957193.0, "step": 1337 }, { "epoch": 2.6771850738792886, "grad_norm": 0.1051437267758934, "learning_rate": 2.0785827307193934e-05, "loss": 0.5198, "num_tokens": 5589141253.0, "step": 1338 }, { "epoch": 2.679188580015026, "grad_norm": 0.12182193008936477, "learning_rate": 2.0762514340343993e-05, "loss": 0.5225, "num_tokens": 5593335557.0, "step": 1339 }, { "epoch": 2.681192086150764, "grad_norm": 0.11304876355814558, "learning_rate": 2.073920345898893e-05, "loss": 0.5083, "num_tokens": 5597529861.0, "step": 1340 }, { "epoch": 2.683195592286501, "grad_norm": 0.10328921287295231, "learning_rate": 2.0715894702413825e-05, "loss": 0.525, "num_tokens": 5601703486.0, "step": 1341 }, { "epoch": 2.685199098422239, "grad_norm": 0.13136257846593508, "learning_rate": 2.069258810990017e-05, "loss": 0.5162, "num_tokens": 5605897790.0, "step": 1342 }, { "epoch": 2.687202604557976, "grad_norm": 0.09743930842946472, "learning_rate": 2.066928372072583e-05, "loss": 0.5049, "num_tokens": 5610092094.0, "step": 1343 }, { "epoch": 2.689206110693714, "grad_norm": 0.140401098817413, "learning_rate": 2.0645981574164932e-05, "loss": 0.5111, "num_tokens": 5614277392.0, "step": 1344 }, { "epoch": 2.6912096168294517, "grad_norm": 0.11969346377349653, "learning_rate": 2.0622681709487847e-05, "loss": 0.5079, "num_tokens": 5618424004.0, "step": 1345 }, { "epoch": 2.693213122965189, "grad_norm": 0.11739067916662993, "learning_rate": 2.0599384165961084e-05, "loss": 0.5233, "num_tokens": 5622618308.0, "step": 1346 }, { "epoch": 2.6952166291009267, "grad_norm": 0.10491297647789552, "learning_rate": 2.0576088982847245e-05, "loss": 0.5082, "num_tokens": 5626812612.0, "step": 1347 }, { "epoch": 2.6972201352366643, "grad_norm": 0.11255257450008822, "learning_rate": 2.0552796199404966e-05, "loss": 0.5187, "num_tokens": 5630985633.0, "step": 1348 }, { "epoch": 2.699223641372402, "grad_norm": 0.10761782044081158, "learning_rate": 2.0529505854888813e-05, "loss": 0.5111, "num_tokens": 5635179937.0, "step": 1349 }, { "epoch": 2.7012271475081393, "grad_norm": 0.12597219571964244, "learning_rate": 2.050621798854926e-05, "loss": 0.4937, "num_tokens": 5639365339.0, "step": 1350 }, { "epoch": 2.703230653643877, "grad_norm": 0.13052387575589702, "learning_rate": 2.04829326396326e-05, "loss": 0.4974, "num_tokens": 5643559643.0, "step": 1351 }, { "epoch": 2.7052341597796143, "grad_norm": 0.11669391128279333, "learning_rate": 2.0459649847380892e-05, "loss": 0.501, "num_tokens": 5647736862.0, "step": 1352 }, { "epoch": 2.707237665915352, "grad_norm": 0.1183627163507711, "learning_rate": 2.0436369651031863e-05, "loss": 0.5063, "num_tokens": 5651866959.0, "step": 1353 }, { "epoch": 2.7092411720510894, "grad_norm": 0.13795159904278945, "learning_rate": 2.0413092089818897e-05, "loss": 0.5021, "num_tokens": 5656061263.0, "step": 1354 }, { "epoch": 2.711244678186827, "grad_norm": 0.10605135622338357, "learning_rate": 2.038981720297092e-05, "loss": 0.5253, "num_tokens": 5660255567.0, "step": 1355 }, { "epoch": 2.7132481843225644, "grad_norm": 1.1772151122574397, "learning_rate": 2.0366545029712346e-05, "loss": 0.509, "num_tokens": 5664424108.0, "step": 1356 }, { "epoch": 2.715251690458302, "grad_norm": 0.20482089321370414, "learning_rate": 2.0343275609263024e-05, "loss": 0.5049, "num_tokens": 5668606979.0, "step": 1357 }, { "epoch": 2.7172551965940395, "grad_norm": 0.1114408587236563, "learning_rate": 2.032000898083816e-05, "loss": 0.5032, "num_tokens": 5672758014.0, "step": 1358 }, { "epoch": 2.719258702729777, "grad_norm": 0.19253190902948603, "learning_rate": 2.0296745183648254e-05, "loss": 0.5172, "num_tokens": 5676930875.0, "step": 1359 }, { "epoch": 2.7212622088655145, "grad_norm": 0.27160731712521163, "learning_rate": 2.0273484256899046e-05, "loss": 0.5176, "num_tokens": 5681094065.0, "step": 1360 }, { "epoch": 2.723265715001252, "grad_norm": 0.16340796667546686, "learning_rate": 2.025022623979142e-05, "loss": 0.4995, "num_tokens": 5685288369.0, "step": 1361 }, { "epoch": 2.7252692211369896, "grad_norm": 0.13928872134528492, "learning_rate": 2.0226971171521365e-05, "loss": 0.5069, "num_tokens": 5689482673.0, "step": 1362 }, { "epoch": 2.7272727272727275, "grad_norm": 0.13940288760293837, "learning_rate": 2.0203719091279912e-05, "loss": 0.5041, "num_tokens": 5693645779.0, "step": 1363 }, { "epoch": 2.7292762334084646, "grad_norm": 0.12440585790284493, "learning_rate": 2.018047003825304e-05, "loss": 0.5099, "num_tokens": 5697810833.0, "step": 1364 }, { "epoch": 2.7312797395442026, "grad_norm": 0.13019869332920783, "learning_rate": 2.0157224051621633e-05, "loss": 0.5256, "num_tokens": 5702005137.0, "step": 1365 }, { "epoch": 2.7332832456799396, "grad_norm": 0.1270725268938618, "learning_rate": 2.013398117056141e-05, "loss": 0.514, "num_tokens": 5706199441.0, "step": 1366 }, { "epoch": 2.7352867518156776, "grad_norm": 0.11806038179088302, "learning_rate": 2.011074143424284e-05, "loss": 0.5224, "num_tokens": 5710393745.0, "step": 1367 }, { "epoch": 2.737290257951415, "grad_norm": 0.13752285094583347, "learning_rate": 2.008750488183112e-05, "loss": 0.5065, "num_tokens": 5714588049.0, "step": 1368 }, { "epoch": 2.7392937640871526, "grad_norm": 0.13460682832597212, "learning_rate": 2.0064271552486057e-05, "loss": 0.506, "num_tokens": 5718782353.0, "step": 1369 }, { "epoch": 2.74129727022289, "grad_norm": 0.1471925994504961, "learning_rate": 2.0041041485362034e-05, "loss": 0.5283, "num_tokens": 5722976657.0, "step": 1370 }, { "epoch": 2.7433007763586277, "grad_norm": 0.13998563783329843, "learning_rate": 2.0017814719607938e-05, "loss": 0.4966, "num_tokens": 5727164545.0, "step": 1371 }, { "epoch": 2.745304282494365, "grad_norm": 0.1405078475703941, "learning_rate": 1.9994591294367102e-05, "loss": 0.5252, "num_tokens": 5731358849.0, "step": 1372 }, { "epoch": 2.7473077886301027, "grad_norm": 0.13605063016044083, "learning_rate": 1.9971371248777216e-05, "loss": 0.515, "num_tokens": 5735537325.0, "step": 1373 }, { "epoch": 2.7493112947658402, "grad_norm": 0.12847728568399397, "learning_rate": 1.994815462197027e-05, "loss": 0.5188, "num_tokens": 5739731629.0, "step": 1374 }, { "epoch": 2.7513148009015778, "grad_norm": 0.14008174495325382, "learning_rate": 1.992494145307251e-05, "loss": 0.5053, "num_tokens": 5743925933.0, "step": 1375 }, { "epoch": 2.7533183070373153, "grad_norm": 0.14249261508675173, "learning_rate": 1.990173178120434e-05, "loss": 0.5172, "num_tokens": 5748120237.0, "step": 1376 }, { "epoch": 2.755321813173053, "grad_norm": 0.11439115212149738, "learning_rate": 1.9878525645480273e-05, "loss": 0.5131, "num_tokens": 5752288592.0, "step": 1377 }, { "epoch": 2.7573253193087903, "grad_norm": 0.13354130174668247, "learning_rate": 1.9855323085008863e-05, "loss": 0.504, "num_tokens": 5756482896.0, "step": 1378 }, { "epoch": 2.759328825444528, "grad_norm": 0.10554157451095615, "learning_rate": 1.9832124138892653e-05, "loss": 0.509, "num_tokens": 5760677200.0, "step": 1379 }, { "epoch": 2.7613323315802654, "grad_norm": 0.11025827039607076, "learning_rate": 1.9808928846228065e-05, "loss": 0.5054, "num_tokens": 5764871504.0, "step": 1380 }, { "epoch": 2.763335837716003, "grad_norm": 0.11346936386622576, "learning_rate": 1.97857372461054e-05, "loss": 0.5143, "num_tokens": 5769037242.0, "step": 1381 }, { "epoch": 2.7653393438517404, "grad_norm": 0.11539938017681932, "learning_rate": 1.9762549377608704e-05, "loss": 0.4991, "num_tokens": 5773231546.0, "step": 1382 }, { "epoch": 2.767342849987478, "grad_norm": 0.11808278317774021, "learning_rate": 1.9739365279815756e-05, "loss": 0.5058, "num_tokens": 5777424421.0, "step": 1383 }, { "epoch": 2.769346356123216, "grad_norm": 0.10582274126310623, "learning_rate": 1.971618499179797e-05, "loss": 0.5143, "num_tokens": 5781605310.0, "step": 1384 }, { "epoch": 2.771349862258953, "grad_norm": 0.11932033367057905, "learning_rate": 1.9693008552620335e-05, "loss": 0.5213, "num_tokens": 5785787872.0, "step": 1385 }, { "epoch": 2.773353368394691, "grad_norm": 0.11357155436923984, "learning_rate": 1.9669836001341362e-05, "loss": 0.5199, "num_tokens": 5789956024.0, "step": 1386 }, { "epoch": 2.775356874530428, "grad_norm": 0.10242906941021482, "learning_rate": 1.9646667377013022e-05, "loss": 0.5166, "num_tokens": 5794150328.0, "step": 1387 }, { "epoch": 2.777360380666166, "grad_norm": 0.13073562722982504, "learning_rate": 1.962350271868064e-05, "loss": 0.504, "num_tokens": 5798344632.0, "step": 1388 }, { "epoch": 2.7793638868019035, "grad_norm": 0.11635979150247312, "learning_rate": 1.960034206538287e-05, "loss": 0.5122, "num_tokens": 5802538936.0, "step": 1389 }, { "epoch": 2.781367392937641, "grad_norm": 0.14027583793041895, "learning_rate": 1.957718545615163e-05, "loss": 0.5019, "num_tokens": 5806711229.0, "step": 1390 }, { "epoch": 2.7833708990733785, "grad_norm": 0.123834962727391, "learning_rate": 1.9554032930012e-05, "loss": 0.5172, "num_tokens": 5810905533.0, "step": 1391 }, { "epoch": 2.785374405209116, "grad_norm": 0.1361040152120771, "learning_rate": 1.953088452598219e-05, "loss": 0.5136, "num_tokens": 5815084833.0, "step": 1392 }, { "epoch": 2.7873779113448536, "grad_norm": 0.12300414984940422, "learning_rate": 1.9507740283073473e-05, "loss": 0.5164, "num_tokens": 5819279137.0, "step": 1393 }, { "epoch": 2.789381417480591, "grad_norm": 0.10227739094265845, "learning_rate": 1.948460024029009e-05, "loss": 0.5108, "num_tokens": 5823463430.0, "step": 1394 }, { "epoch": 2.7913849236163286, "grad_norm": 0.1280527672036784, "learning_rate": 1.9461464436629213e-05, "loss": 0.5014, "num_tokens": 5827640865.0, "step": 1395 }, { "epoch": 2.793388429752066, "grad_norm": 0.10301484750474411, "learning_rate": 1.9438332911080873e-05, "loss": 0.52, "num_tokens": 5831805930.0, "step": 1396 }, { "epoch": 2.7953919358878037, "grad_norm": 0.11817110205730413, "learning_rate": 1.9415205702627884e-05, "loss": 0.5132, "num_tokens": 5836000234.0, "step": 1397 }, { "epoch": 2.797395442023541, "grad_norm": 0.10731081349707138, "learning_rate": 1.9392082850245785e-05, "loss": 0.4955, "num_tokens": 5840194538.0, "step": 1398 }, { "epoch": 2.7993989481592787, "grad_norm": 0.11082153772817123, "learning_rate": 1.936896439290279e-05, "loss": 0.5084, "num_tokens": 5844369616.0, "step": 1399 }, { "epoch": 2.801402454295016, "grad_norm": 0.10642872340363324, "learning_rate": 1.934585036955968e-05, "loss": 0.5139, "num_tokens": 5848523747.0, "step": 1400 }, { "epoch": 2.8034059604307537, "grad_norm": 0.10532752243872465, "learning_rate": 1.9322740819169788e-05, "loss": 0.5158, "num_tokens": 5852718051.0, "step": 1401 }, { "epoch": 2.8054094665664913, "grad_norm": 0.10362839975973337, "learning_rate": 1.929963578067889e-05, "loss": 0.5064, "num_tokens": 5856912355.0, "step": 1402 }, { "epoch": 2.807412972702229, "grad_norm": 0.09966994640552738, "learning_rate": 1.9276535293025173e-05, "loss": 0.5022, "num_tokens": 5861106659.0, "step": 1403 }, { "epoch": 2.8094164788379663, "grad_norm": 0.09902983780795838, "learning_rate": 1.9253439395139136e-05, "loss": 0.505, "num_tokens": 5865265457.0, "step": 1404 }, { "epoch": 2.811419984973704, "grad_norm": 0.10668034085579119, "learning_rate": 1.9230348125943566e-05, "loss": 0.5218, "num_tokens": 5869459761.0, "step": 1405 }, { "epoch": 2.8134234911094413, "grad_norm": 0.107694617671516, "learning_rate": 1.9207261524353437e-05, "loss": 0.5064, "num_tokens": 5873654065.0, "step": 1406 }, { "epoch": 2.8154269972451793, "grad_norm": 0.09666637356225088, "learning_rate": 1.9184179629275845e-05, "loss": 0.5119, "num_tokens": 5877848369.0, "step": 1407 }, { "epoch": 2.8174305033809164, "grad_norm": 0.11587927962745412, "learning_rate": 1.9161102479609984e-05, "loss": 0.5041, "num_tokens": 5882042673.0, "step": 1408 }, { "epoch": 2.8194340095166543, "grad_norm": 0.11064519056376648, "learning_rate": 1.9138030114247022e-05, "loss": 0.5101, "num_tokens": 5886226230.0, "step": 1409 }, { "epoch": 2.8214375156523914, "grad_norm": 0.12953700613388783, "learning_rate": 1.911496257207009e-05, "loss": 0.5062, "num_tokens": 5890420534.0, "step": 1410 }, { "epoch": 2.8234410217881294, "grad_norm": 0.12682429152376917, "learning_rate": 1.9091899891954164e-05, "loss": 0.5069, "num_tokens": 5894614838.0, "step": 1411 }, { "epoch": 2.825444527923867, "grad_norm": 0.11698505888885781, "learning_rate": 1.9068842112766043e-05, "loss": 0.5167, "num_tokens": 5898753823.0, "step": 1412 }, { "epoch": 2.8274480340596044, "grad_norm": 0.10352003223096899, "learning_rate": 1.9045789273364257e-05, "loss": 0.5097, "num_tokens": 5902943576.0, "step": 1413 }, { "epoch": 2.829451540195342, "grad_norm": 0.12690013118315113, "learning_rate": 1.9022741412599028e-05, "loss": 0.5018, "num_tokens": 5907137880.0, "step": 1414 }, { "epoch": 2.8314550463310795, "grad_norm": 0.09445923636957397, "learning_rate": 1.8999698569312167e-05, "loss": 0.4944, "num_tokens": 5911332184.0, "step": 1415 }, { "epoch": 2.833458552466817, "grad_norm": 0.1470948164475778, "learning_rate": 1.8976660782337033e-05, "loss": 0.5346, "num_tokens": 5915526488.0, "step": 1416 }, { "epoch": 2.8354620586025545, "grad_norm": 0.08956101530842725, "learning_rate": 1.8953628090498485e-05, "loss": 0.509, "num_tokens": 5919720792.0, "step": 1417 }, { "epoch": 2.837465564738292, "grad_norm": 0.11733076584651217, "learning_rate": 1.8930600532612774e-05, "loss": 0.5097, "num_tokens": 5923889198.0, "step": 1418 }, { "epoch": 2.8394690708740296, "grad_norm": 0.10276029977838082, "learning_rate": 1.8907578147487503e-05, "loss": 0.5083, "num_tokens": 5928077253.0, "step": 1419 }, { "epoch": 2.841472577009767, "grad_norm": 0.09921736049800331, "learning_rate": 1.8884560973921555e-05, "loss": 0.5033, "num_tokens": 5932271557.0, "step": 1420 }, { "epoch": 2.8434760831455046, "grad_norm": 0.0977980721526914, "learning_rate": 1.886154905070504e-05, "loss": 0.5143, "num_tokens": 5936432504.0, "step": 1421 }, { "epoch": 2.845479589281242, "grad_norm": 0.1025868887512952, "learning_rate": 1.883854241661921e-05, "loss": 0.4961, "num_tokens": 5940626808.0, "step": 1422 }, { "epoch": 2.8474830954169796, "grad_norm": 0.09534605702596385, "learning_rate": 1.881554111043641e-05, "loss": 0.5032, "num_tokens": 5944821112.0, "step": 1423 }, { "epoch": 2.849486601552717, "grad_norm": 0.11975709781381812, "learning_rate": 1.879254517091999e-05, "loss": 0.5123, "num_tokens": 5948998465.0, "step": 1424 }, { "epoch": 2.8514901076884547, "grad_norm": 0.12348875781134352, "learning_rate": 1.8769554636824298e-05, "loss": 0.5213, "num_tokens": 5953192769.0, "step": 1425 }, { "epoch": 2.853493613824192, "grad_norm": 0.11456827501973682, "learning_rate": 1.874656954689452e-05, "loss": 0.4964, "num_tokens": 5957361050.0, "step": 1426 }, { "epoch": 2.8554971199599297, "grad_norm": 0.15231929478070652, "learning_rate": 1.8723589939866692e-05, "loss": 0.5096, "num_tokens": 5961538760.0, "step": 1427 }, { "epoch": 2.8575006260956677, "grad_norm": 0.10975346080597492, "learning_rate": 1.870061585446762e-05, "loss": 0.5195, "num_tokens": 5965727215.0, "step": 1428 }, { "epoch": 2.8595041322314048, "grad_norm": 0.11275907215085443, "learning_rate": 1.8677647329414787e-05, "loss": 0.5053, "num_tokens": 5969921519.0, "step": 1429 }, { "epoch": 2.8615076383671427, "grad_norm": 0.0951100382873407, "learning_rate": 1.8654684403416306e-05, "loss": 0.504, "num_tokens": 5974100446.0, "step": 1430 }, { "epoch": 2.86351114450288, "grad_norm": 0.09762803073275414, "learning_rate": 1.8631727115170874e-05, "loss": 0.4935, "num_tokens": 5978266781.0, "step": 1431 }, { "epoch": 2.8655146506386178, "grad_norm": 0.09436703299407966, "learning_rate": 1.860877550336765e-05, "loss": 0.4941, "num_tokens": 5982446560.0, "step": 1432 }, { "epoch": 2.8675181567743553, "grad_norm": 0.09425748295043665, "learning_rate": 1.8585829606686258e-05, "loss": 0.5054, "num_tokens": 5986640864.0, "step": 1433 }, { "epoch": 2.869521662910093, "grad_norm": 0.11328286174749813, "learning_rate": 1.8562889463796697e-05, "loss": 0.5006, "num_tokens": 5990835168.0, "step": 1434 }, { "epoch": 2.8715251690458303, "grad_norm": 0.11834162465135761, "learning_rate": 1.8539955113359238e-05, "loss": 0.4983, "num_tokens": 5995029472.0, "step": 1435 }, { "epoch": 2.873528675181568, "grad_norm": 0.09142662984054711, "learning_rate": 1.8517026594024406e-05, "loss": 0.5114, "num_tokens": 5999223776.0, "step": 1436 }, { "epoch": 2.8755321813173054, "grad_norm": 0.11049746472612873, "learning_rate": 1.8494103944432908e-05, "loss": 0.5122, "num_tokens": 6003418080.0, "step": 1437 }, { "epoch": 2.877535687453043, "grad_norm": 0.09631287440009823, "learning_rate": 1.847118720321554e-05, "loss": 0.5082, "num_tokens": 6007590984.0, "step": 1438 }, { "epoch": 2.8795391935887804, "grad_norm": 0.10657783847968465, "learning_rate": 1.844827640899316e-05, "loss": 0.523, "num_tokens": 6011785288.0, "step": 1439 }, { "epoch": 2.881542699724518, "grad_norm": 0.10223616584222879, "learning_rate": 1.842537160037659e-05, "loss": 0.4922, "num_tokens": 6015979592.0, "step": 1440 }, { "epoch": 2.8835462058602555, "grad_norm": 0.10546711776359917, "learning_rate": 1.8402472815966563e-05, "loss": 0.5138, "num_tokens": 6020142994.0, "step": 1441 }, { "epoch": 2.885549711995993, "grad_norm": 0.09430333481405795, "learning_rate": 1.8379580094353664e-05, "loss": 0.5045, "num_tokens": 6024327029.0, "step": 1442 }, { "epoch": 2.8875532181317305, "grad_norm": 0.09943819081556074, "learning_rate": 1.835669347411828e-05, "loss": 0.4968, "num_tokens": 6028501046.0, "step": 1443 }, { "epoch": 2.889556724267468, "grad_norm": 0.10312779644413352, "learning_rate": 1.833381299383049e-05, "loss": 0.5173, "num_tokens": 6032695350.0, "step": 1444 }, { "epoch": 2.8915602304032055, "grad_norm": 0.09793574195296312, "learning_rate": 1.8310938692050023e-05, "loss": 0.514, "num_tokens": 6036889654.0, "step": 1445 }, { "epoch": 2.893563736538943, "grad_norm": 0.11551747229128481, "learning_rate": 1.828807060732622e-05, "loss": 0.5103, "num_tokens": 6041083958.0, "step": 1446 }, { "epoch": 2.8955672426746806, "grad_norm": 0.10538326162400495, "learning_rate": 1.8265208778197915e-05, "loss": 0.5105, "num_tokens": 6045278262.0, "step": 1447 }, { "epoch": 2.897570748810418, "grad_norm": 0.11528528539691796, "learning_rate": 1.824235324319343e-05, "loss": 0.5249, "num_tokens": 6049472566.0, "step": 1448 }, { "epoch": 2.8995742549461556, "grad_norm": 0.11730443960511203, "learning_rate": 1.8219504040830456e-05, "loss": 0.5087, "num_tokens": 6053666870.0, "step": 1449 }, { "epoch": 2.901577761081893, "grad_norm": 0.08987486552864996, "learning_rate": 1.8196661209616024e-05, "loss": 0.5048, "num_tokens": 6057861174.0, "step": 1450 }, { "epoch": 2.903581267217631, "grad_norm": 0.1296629675101775, "learning_rate": 1.817382478804642e-05, "loss": 0.5123, "num_tokens": 6062055478.0, "step": 1451 }, { "epoch": 2.905584773353368, "grad_norm": 0.09331953223381839, "learning_rate": 1.815099481460714e-05, "loss": 0.507, "num_tokens": 6066248088.0, "step": 1452 }, { "epoch": 2.907588279489106, "grad_norm": 0.11710728812171707, "learning_rate": 1.8128171327772803e-05, "loss": 0.512, "num_tokens": 6070418967.0, "step": 1453 }, { "epoch": 2.909591785624843, "grad_norm": 0.10004784455734418, "learning_rate": 1.8105354366007103e-05, "loss": 0.4911, "num_tokens": 6074613271.0, "step": 1454 }, { "epoch": 2.911595291760581, "grad_norm": 0.11052346294328251, "learning_rate": 1.8082543967762718e-05, "loss": 0.5067, "num_tokens": 6078807575.0, "step": 1455 }, { "epoch": 2.9135987978963187, "grad_norm": 0.09462434577733318, "learning_rate": 1.8059740171481297e-05, "loss": 0.4911, "num_tokens": 6082982594.0, "step": 1456 }, { "epoch": 2.9156023040320562, "grad_norm": 0.11729151645695936, "learning_rate": 1.803694301559334e-05, "loss": 0.4947, "num_tokens": 6087154761.0, "step": 1457 }, { "epoch": 2.9176058101677937, "grad_norm": 0.10347464074993751, "learning_rate": 1.801415253851816e-05, "loss": 0.5064, "num_tokens": 6091267905.0, "step": 1458 }, { "epoch": 2.9196093163035313, "grad_norm": 0.10451161908640537, "learning_rate": 1.7991368778663823e-05, "loss": 0.513, "num_tokens": 6095462209.0, "step": 1459 }, { "epoch": 2.921612822439269, "grad_norm": 0.10672852026401743, "learning_rate": 1.796859177442706e-05, "loss": 0.4956, "num_tokens": 6099656513.0, "step": 1460 }, { "epoch": 2.9236163285750063, "grad_norm": 0.11681201749470689, "learning_rate": 1.7945821564193235e-05, "loss": 0.5076, "num_tokens": 6103850817.0, "step": 1461 }, { "epoch": 2.925619834710744, "grad_norm": 0.09479741064099942, "learning_rate": 1.792305818633624e-05, "loss": 0.5105, "num_tokens": 6108045121.0, "step": 1462 }, { "epoch": 2.9276233408464813, "grad_norm": 0.11195899318936094, "learning_rate": 1.790030167921848e-05, "loss": 0.5214, "num_tokens": 6112235043.0, "step": 1463 }, { "epoch": 2.929626846982219, "grad_norm": 0.1242304122059702, "learning_rate": 1.7877552081190763e-05, "loss": 0.4983, "num_tokens": 6116429347.0, "step": 1464 }, { "epoch": 2.9316303531179564, "grad_norm": 0.09134840421084921, "learning_rate": 1.7854809430592247e-05, "loss": 0.5133, "num_tokens": 6120612872.0, "step": 1465 }, { "epoch": 2.933633859253694, "grad_norm": 0.12728830169065886, "learning_rate": 1.7832073765750396e-05, "loss": 0.4981, "num_tokens": 6124807176.0, "step": 1466 }, { "epoch": 2.9356373653894314, "grad_norm": 0.09957590929762641, "learning_rate": 1.7809345124980906e-05, "loss": 0.4958, "num_tokens": 6129001480.0, "step": 1467 }, { "epoch": 2.937640871525169, "grad_norm": 0.11757413278744569, "learning_rate": 1.7786623546587613e-05, "loss": 0.5119, "num_tokens": 6133166520.0, "step": 1468 }, { "epoch": 2.9396443776609065, "grad_norm": 0.09256828183925019, "learning_rate": 1.776390906886246e-05, "loss": 0.506, "num_tokens": 6137347365.0, "step": 1469 }, { "epoch": 2.941647883796644, "grad_norm": 0.11243118223735324, "learning_rate": 1.7741201730085448e-05, "loss": 0.4903, "num_tokens": 6141515665.0, "step": 1470 }, { "epoch": 2.9436513899323815, "grad_norm": 0.10746693503535878, "learning_rate": 1.7718501568524512e-05, "loss": 0.5072, "num_tokens": 6145709969.0, "step": 1471 }, { "epoch": 2.9456548960681195, "grad_norm": 0.09632752465879013, "learning_rate": 1.7695808622435513e-05, "loss": 0.5029, "num_tokens": 6149865582.0, "step": 1472 }, { "epoch": 2.9476584022038566, "grad_norm": 0.0948385332250988, "learning_rate": 1.7673122930062138e-05, "loss": 0.491, "num_tokens": 6154059886.0, "step": 1473 }, { "epoch": 2.9496619083395945, "grad_norm": 0.10902497340885793, "learning_rate": 1.7650444529635858e-05, "loss": 0.5165, "num_tokens": 6158251156.0, "step": 1474 }, { "epoch": 2.9516654144753316, "grad_norm": 0.10557529337872212, "learning_rate": 1.7627773459375858e-05, "loss": 0.5002, "num_tokens": 6162439168.0, "step": 1475 }, { "epoch": 2.9536689206110696, "grad_norm": 0.0989990462436108, "learning_rate": 1.760510975748896e-05, "loss": 0.4979, "num_tokens": 6166630442.0, "step": 1476 }, { "epoch": 2.9556724267468066, "grad_norm": 0.10676227889591353, "learning_rate": 1.758245346216958e-05, "loss": 0.5026, "num_tokens": 6170824746.0, "step": 1477 }, { "epoch": 2.9576759328825446, "grad_norm": 0.09822150040963895, "learning_rate": 1.755980461159963e-05, "loss": 0.5097, "num_tokens": 6175019050.0, "step": 1478 }, { "epoch": 2.959679439018282, "grad_norm": 0.11024216415830689, "learning_rate": 1.753716324394851e-05, "loss": 0.5134, "num_tokens": 6179213354.0, "step": 1479 }, { "epoch": 2.9616829451540196, "grad_norm": 0.10373174909045459, "learning_rate": 1.751452939737298e-05, "loss": 0.5049, "num_tokens": 6183405934.0, "step": 1480 }, { "epoch": 2.963686451289757, "grad_norm": 0.09665191572632109, "learning_rate": 1.7491903110017145e-05, "loss": 0.5146, "num_tokens": 6187600238.0, "step": 1481 }, { "epoch": 2.9656899574254947, "grad_norm": 0.10576464736889378, "learning_rate": 1.7469284420012348e-05, "loss": 0.5157, "num_tokens": 6191765286.0, "step": 1482 }, { "epoch": 2.967693463561232, "grad_norm": 0.10263538452672585, "learning_rate": 1.7446673365477152e-05, "loss": 0.5015, "num_tokens": 6195959590.0, "step": 1483 }, { "epoch": 2.9696969696969697, "grad_norm": 0.09060560948963818, "learning_rate": 1.742406998451724e-05, "loss": 0.4926, "num_tokens": 6200103831.0, "step": 1484 }, { "epoch": 2.9717004758327072, "grad_norm": 0.09730356912951538, "learning_rate": 1.740147431522536e-05, "loss": 0.5245, "num_tokens": 6204298135.0, "step": 1485 }, { "epoch": 2.9737039819684448, "grad_norm": 0.10573975405556799, "learning_rate": 1.7378886395681272e-05, "loss": 0.5042, "num_tokens": 6208492439.0, "step": 1486 }, { "epoch": 2.9757074881041823, "grad_norm": 0.10918584720799017, "learning_rate": 1.735630626395166e-05, "loss": 0.506, "num_tokens": 6212686743.0, "step": 1487 }, { "epoch": 2.97771099423992, "grad_norm": 0.1110779261850766, "learning_rate": 1.7333733958090117e-05, "loss": 0.5094, "num_tokens": 6216881047.0, "step": 1488 }, { "epoch": 2.9797145003756573, "grad_norm": 0.10790803907129891, "learning_rate": 1.7311169516137016e-05, "loss": 0.5099, "num_tokens": 6221075351.0, "step": 1489 }, { "epoch": 2.981718006511395, "grad_norm": 0.131739380347691, "learning_rate": 1.7288612976119486e-05, "loss": 0.5111, "num_tokens": 6225247818.0, "step": 1490 }, { "epoch": 2.9837215126471324, "grad_norm": 0.11742481426097191, "learning_rate": 1.7266064376051338e-05, "loss": 0.5081, "num_tokens": 6229418492.0, "step": 1491 }, { "epoch": 2.98572501878287, "grad_norm": 0.12175641745448466, "learning_rate": 1.7243523753933013e-05, "loss": 0.5032, "num_tokens": 6233612796.0, "step": 1492 }, { "epoch": 2.9877285249186074, "grad_norm": 0.09875114331168916, "learning_rate": 1.7220991147751486e-05, "loss": 0.5026, "num_tokens": 6237801748.0, "step": 1493 }, { "epoch": 2.989732031054345, "grad_norm": 0.12015616491845017, "learning_rate": 1.7198466595480244e-05, "loss": 0.5105, "num_tokens": 6241983219.0, "step": 1494 }, { "epoch": 2.991735537190083, "grad_norm": 0.10628042980282702, "learning_rate": 1.7175950135079186e-05, "loss": 0.5219, "num_tokens": 6246175261.0, "step": 1495 }, { "epoch": 2.99373904332582, "grad_norm": 0.11530936880988558, "learning_rate": 1.7153441804494583e-05, "loss": 0.5251, "num_tokens": 6250369565.0, "step": 1496 }, { "epoch": 2.995742549461558, "grad_norm": 0.11352500055306707, "learning_rate": 1.7130941641659006e-05, "loss": 0.5129, "num_tokens": 6254563869.0, "step": 1497 }, { "epoch": 2.997746055597295, "grad_norm": 0.10193873864345893, "learning_rate": 1.7108449684491253e-05, "loss": 0.5091, "num_tokens": 6258758173.0, "step": 1498 }, { "epoch": 2.999749561733033, "grad_norm": 0.11671035510122342, "learning_rate": 1.70859659708963e-05, "loss": 0.5179, "num_tokens": 6262947013.0, "step": 1499 }, { "epoch": 3.0, "grad_norm": 0.11671035510122342, "learning_rate": 1.706349053876523e-05, "loss": 0.5076, "num_tokens": 6263471301.0, "step": 1500 }, { "epoch": 3.0020035061357375, "grad_norm": 0.25436540444008787, "learning_rate": 1.704102342597516e-05, "loss": 0.4834, "num_tokens": 6267657457.0, "step": 1501 }, { "epoch": 3.004007012271475, "grad_norm": 0.13653109745223993, "learning_rate": 1.7018564670389205e-05, "loss": 0.4891, "num_tokens": 6271851761.0, "step": 1502 }, { "epoch": 3.0060105184072126, "grad_norm": 0.1483985339028301, "learning_rate": 1.6996114309856378e-05, "loss": 0.4827, "num_tokens": 6276018255.0, "step": 1503 }, { "epoch": 3.00801402454295, "grad_norm": 0.11708973436000145, "learning_rate": 1.6973672382211542e-05, "loss": 0.491, "num_tokens": 6280212559.0, "step": 1504 }, { "epoch": 3.0100175306786876, "grad_norm": 0.1587456522262091, "learning_rate": 1.6951238925275385e-05, "loss": 0.492, "num_tokens": 6284389915.0, "step": 1505 }, { "epoch": 3.012021036814425, "grad_norm": 0.1404503638186555, "learning_rate": 1.6928813976854267e-05, "loss": 0.4776, "num_tokens": 6288584219.0, "step": 1506 }, { "epoch": 3.0140245429501626, "grad_norm": 0.12957599758300897, "learning_rate": 1.6906397574740247e-05, "loss": 0.4772, "num_tokens": 6292759680.0, "step": 1507 }, { "epoch": 3.0160280490859, "grad_norm": 0.14129836901437584, "learning_rate": 1.6883989756710964e-05, "loss": 0.4983, "num_tokens": 6296953984.0, "step": 1508 }, { "epoch": 3.0180315552216377, "grad_norm": 0.11007226735443389, "learning_rate": 1.6861590560529593e-05, "loss": 0.4881, "num_tokens": 6301148288.0, "step": 1509 }, { "epoch": 3.020035061357375, "grad_norm": 0.13300412660169125, "learning_rate": 1.683920002394478e-05, "loss": 0.4995, "num_tokens": 6305330834.0, "step": 1510 }, { "epoch": 3.022038567493113, "grad_norm": 0.12287703963819674, "learning_rate": 1.681681818469058e-05, "loss": 0.481, "num_tokens": 6309522108.0, "step": 1511 }, { "epoch": 3.0240420736288507, "grad_norm": 0.11062579848478901, "learning_rate": 1.679444508048639e-05, "loss": 0.479, "num_tokens": 6313716412.0, "step": 1512 }, { "epoch": 3.026045579764588, "grad_norm": 0.13254310577751685, "learning_rate": 1.677208074903688e-05, "loss": 0.4776, "num_tokens": 6317879327.0, "step": 1513 }, { "epoch": 3.0280490859003257, "grad_norm": 0.12004157861080848, "learning_rate": 1.674972522803195e-05, "loss": 0.4735, "num_tokens": 6322073631.0, "step": 1514 }, { "epoch": 3.0300525920360633, "grad_norm": 0.10486519486154021, "learning_rate": 1.672737855514664e-05, "loss": 0.5051, "num_tokens": 6326236946.0, "step": 1515 }, { "epoch": 3.0320560981718008, "grad_norm": 0.10111963164103184, "learning_rate": 1.670504076804107e-05, "loss": 0.489, "num_tokens": 6330431250.0, "step": 1516 }, { "epoch": 3.0340596043075383, "grad_norm": 0.1397245137387941, "learning_rate": 1.6682711904360404e-05, "loss": 0.48, "num_tokens": 6334625554.0, "step": 1517 }, { "epoch": 3.036063110443276, "grad_norm": 0.1080070427020935, "learning_rate": 1.6660392001734756e-05, "loss": 0.4746, "num_tokens": 6338819858.0, "step": 1518 }, { "epoch": 3.0380666165790133, "grad_norm": 0.11485512766406675, "learning_rate": 1.6638081097779143e-05, "loss": 0.475, "num_tokens": 6343014162.0, "step": 1519 }, { "epoch": 3.040070122714751, "grad_norm": 0.10997652182783815, "learning_rate": 1.6615779230093424e-05, "loss": 0.4879, "num_tokens": 6347208466.0, "step": 1520 }, { "epoch": 3.0420736288504884, "grad_norm": 0.10952912869849862, "learning_rate": 1.659348643626221e-05, "loss": 0.4822, "num_tokens": 6351401958.0, "step": 1521 }, { "epoch": 3.044077134986226, "grad_norm": 0.10252499009566256, "learning_rate": 1.657120275385483e-05, "loss": 0.4797, "num_tokens": 6355577834.0, "step": 1522 }, { "epoch": 3.0460806411219634, "grad_norm": 0.10492826624262445, "learning_rate": 1.6548928220425268e-05, "loss": 0.4819, "num_tokens": 6359708268.0, "step": 1523 }, { "epoch": 3.048084147257701, "grad_norm": 0.1302213264080914, "learning_rate": 1.652666287351208e-05, "loss": 0.4853, "num_tokens": 6363902572.0, "step": 1524 }, { "epoch": 3.0500876533934385, "grad_norm": 0.10548122987382205, "learning_rate": 1.6504406750638325e-05, "loss": 0.4875, "num_tokens": 6368096876.0, "step": 1525 }, { "epoch": 3.052091159529176, "grad_norm": 0.1155817472561886, "learning_rate": 1.6482159889311554e-05, "loss": 0.4812, "num_tokens": 6372275822.0, "step": 1526 }, { "epoch": 3.0540946656649135, "grad_norm": 0.106200357455203, "learning_rate": 1.6459922327023678e-05, "loss": 0.488, "num_tokens": 6376405277.0, "step": 1527 }, { "epoch": 3.056098171800651, "grad_norm": 0.10106660191449636, "learning_rate": 1.643769410125095e-05, "loss": 0.4852, "num_tokens": 6380564586.0, "step": 1528 }, { "epoch": 3.0581016779363885, "grad_norm": 0.11099176661821311, "learning_rate": 1.6415475249453884e-05, "loss": 0.4846, "num_tokens": 6384738251.0, "step": 1529 }, { "epoch": 3.060105184072126, "grad_norm": 0.1164171049539933, "learning_rate": 1.63932658090772e-05, "loss": 0.488, "num_tokens": 6388932555.0, "step": 1530 }, { "epoch": 3.0621086902078636, "grad_norm": 0.11200986415606574, "learning_rate": 1.6371065817549747e-05, "loss": 0.4977, "num_tokens": 6393074550.0, "step": 1531 }, { "epoch": 3.064112196343601, "grad_norm": 0.11146674048578982, "learning_rate": 1.6348875312284463e-05, "loss": 0.4852, "num_tokens": 6397222357.0, "step": 1532 }, { "epoch": 3.0661157024793386, "grad_norm": 0.11934746785098671, "learning_rate": 1.6326694330678306e-05, "loss": 0.4859, "num_tokens": 6401416661.0, "step": 1533 }, { "epoch": 3.0681192086150766, "grad_norm": 0.10131310835342502, "learning_rate": 1.6304522910112158e-05, "loss": 0.4852, "num_tokens": 6405610965.0, "step": 1534 }, { "epoch": 3.070122714750814, "grad_norm": 0.12632393870193, "learning_rate": 1.6282361087950814e-05, "loss": 0.4928, "num_tokens": 6409805269.0, "step": 1535 }, { "epoch": 3.0721262208865516, "grad_norm": 0.10645256092613863, "learning_rate": 1.6260208901542867e-05, "loss": 0.4958, "num_tokens": 6413992744.0, "step": 1536 }, { "epoch": 3.074129727022289, "grad_norm": 0.10631843423956622, "learning_rate": 1.6238066388220702e-05, "loss": 0.4837, "num_tokens": 6418187048.0, "step": 1537 }, { "epoch": 3.0761332331580267, "grad_norm": 0.1031084971231914, "learning_rate": 1.6215933585300375e-05, "loss": 0.4877, "num_tokens": 6422357861.0, "step": 1538 }, { "epoch": 3.078136739293764, "grad_norm": 0.09371364995330134, "learning_rate": 1.6193810530081595e-05, "loss": 0.4861, "num_tokens": 6426529621.0, "step": 1539 }, { "epoch": 3.0801402454295017, "grad_norm": 0.10206590000884871, "learning_rate": 1.6171697259847626e-05, "loss": 0.4822, "num_tokens": 6430723925.0, "step": 1540 }, { "epoch": 3.0821437515652392, "grad_norm": 0.09149478844734932, "learning_rate": 1.6149593811865267e-05, "loss": 0.4802, "num_tokens": 6434918229.0, "step": 1541 }, { "epoch": 3.0841472577009768, "grad_norm": 0.10256426295976978, "learning_rate": 1.6127500223384748e-05, "loss": 0.4778, "num_tokens": 6439112533.0, "step": 1542 }, { "epoch": 3.0861507638367143, "grad_norm": 0.09226373875313161, "learning_rate": 1.610541653163968e-05, "loss": 0.4911, "num_tokens": 6443273403.0, "step": 1543 }, { "epoch": 3.088154269972452, "grad_norm": 0.10167282940777077, "learning_rate": 1.6083342773847004e-05, "loss": 0.4929, "num_tokens": 6447447072.0, "step": 1544 }, { "epoch": 3.0901577761081893, "grad_norm": 0.09745418621469254, "learning_rate": 1.6061278987206906e-05, "loss": 0.4833, "num_tokens": 6451636242.0, "step": 1545 }, { "epoch": 3.092161282243927, "grad_norm": 0.09871428671962086, "learning_rate": 1.603922520890279e-05, "loss": 0.483, "num_tokens": 6455793700.0, "step": 1546 }, { "epoch": 3.0941647883796644, "grad_norm": 0.1082363315672795, "learning_rate": 1.601718147610117e-05, "loss": 0.4757, "num_tokens": 6459988004.0, "step": 1547 }, { "epoch": 3.096168294515402, "grad_norm": 0.0912095427356618, "learning_rate": 1.5995147825951653e-05, "loss": 0.4913, "num_tokens": 6464178961.0, "step": 1548 }, { "epoch": 3.0981718006511394, "grad_norm": 0.11903214965214128, "learning_rate": 1.5973124295586825e-05, "loss": 0.4838, "num_tokens": 6468373265.0, "step": 1549 }, { "epoch": 3.100175306786877, "grad_norm": 0.09018697545096202, "learning_rate": 1.5951110922122257e-05, "loss": 0.4787, "num_tokens": 6472567569.0, "step": 1550 }, { "epoch": 3.1021788129226144, "grad_norm": 0.09580861932616098, "learning_rate": 1.592910774265637e-05, "loss": 0.4928, "num_tokens": 6476723934.0, "step": 1551 }, { "epoch": 3.104182319058352, "grad_norm": 0.09084868676538305, "learning_rate": 1.590711479427042e-05, "loss": 0.4888, "num_tokens": 6480907607.0, "step": 1552 }, { "epoch": 3.1061858251940895, "grad_norm": 0.10198724657216701, "learning_rate": 1.588513211402841e-05, "loss": 0.4836, "num_tokens": 6485101911.0, "step": 1553 }, { "epoch": 3.108189331329827, "grad_norm": 0.09870022800160726, "learning_rate": 1.586315973897705e-05, "loss": 0.4889, "num_tokens": 6489296215.0, "step": 1554 }, { "epoch": 3.110192837465565, "grad_norm": 0.09323855614694225, "learning_rate": 1.5841197706145685e-05, "loss": 0.4868, "num_tokens": 6493460308.0, "step": 1555 }, { "epoch": 3.1121963436013025, "grad_norm": 0.1003007448690338, "learning_rate": 1.5819246052546216e-05, "loss": 0.4905, "num_tokens": 6497654612.0, "step": 1556 }, { "epoch": 3.11419984973704, "grad_norm": 0.10870174117225628, "learning_rate": 1.579730481517306e-05, "loss": 0.487, "num_tokens": 6501848916.0, "step": 1557 }, { "epoch": 3.1162033558727775, "grad_norm": 0.08953723311958511, "learning_rate": 1.5775374031003073e-05, "loss": 0.4804, "num_tokens": 6506043220.0, "step": 1558 }, { "epoch": 3.118206862008515, "grad_norm": 0.10555340196839853, "learning_rate": 1.5753453736995523e-05, "loss": 0.4966, "num_tokens": 6510212812.0, "step": 1559 }, { "epoch": 3.1202103681442526, "grad_norm": 0.08886030506339272, "learning_rate": 1.573154397009197e-05, "loss": 0.4855, "num_tokens": 6514407116.0, "step": 1560 }, { "epoch": 3.12221387427999, "grad_norm": 0.10520952105536499, "learning_rate": 1.5709644767216233e-05, "loss": 0.4874, "num_tokens": 6518587699.0, "step": 1561 }, { "epoch": 3.1242173804157276, "grad_norm": 0.09859023926078675, "learning_rate": 1.568775616527434e-05, "loss": 0.4948, "num_tokens": 6522770213.0, "step": 1562 }, { "epoch": 3.126220886551465, "grad_norm": 0.09915854462265092, "learning_rate": 1.5665878201154457e-05, "loss": 0.4808, "num_tokens": 6526964517.0, "step": 1563 }, { "epoch": 3.1282243926872026, "grad_norm": 0.10275613254630278, "learning_rate": 1.5644010911726805e-05, "loss": 0.4888, "num_tokens": 6531137203.0, "step": 1564 }, { "epoch": 3.13022789882294, "grad_norm": 0.08680581054667652, "learning_rate": 1.5622154333843628e-05, "loss": 0.4792, "num_tokens": 6535236446.0, "step": 1565 }, { "epoch": 3.1322314049586777, "grad_norm": 0.10730355322070294, "learning_rate": 1.560030850433912e-05, "loss": 0.4769, "num_tokens": 6539430750.0, "step": 1566 }, { "epoch": 3.134234911094415, "grad_norm": 0.10561365465661832, "learning_rate": 1.5578473460029343e-05, "loss": 0.4748, "num_tokens": 6543610775.0, "step": 1567 }, { "epoch": 3.1362384172301527, "grad_norm": 0.09300540644355397, "learning_rate": 1.5556649237712214e-05, "loss": 0.4724, "num_tokens": 6547781711.0, "step": 1568 }, { "epoch": 3.1382419233658903, "grad_norm": 0.10373362857492222, "learning_rate": 1.553483587416739e-05, "loss": 0.4875, "num_tokens": 6551976015.0, "step": 1569 }, { "epoch": 3.1402454295016278, "grad_norm": 0.10871294113069528, "learning_rate": 1.5513033406156224e-05, "loss": 0.4795, "num_tokens": 6556155601.0, "step": 1570 }, { "epoch": 3.1422489356373653, "grad_norm": 0.10497198125051285, "learning_rate": 1.5491241870421726e-05, "loss": 0.4777, "num_tokens": 6560323991.0, "step": 1571 }, { "epoch": 3.144252441773103, "grad_norm": 0.10412450323861132, "learning_rate": 1.5469461303688466e-05, "loss": 0.489, "num_tokens": 6564471930.0, "step": 1572 }, { "epoch": 3.1462559479088403, "grad_norm": 0.09295055267142482, "learning_rate": 1.5447691742662537e-05, "loss": 0.4801, "num_tokens": 6568666234.0, "step": 1573 }, { "epoch": 3.148259454044578, "grad_norm": 0.10245529125743341, "learning_rate": 1.542593322403148e-05, "loss": 0.5007, "num_tokens": 6572860538.0, "step": 1574 }, { "epoch": 3.1502629601803154, "grad_norm": 0.09440988742126073, "learning_rate": 1.5404185784464222e-05, "loss": 0.4842, "num_tokens": 6577035616.0, "step": 1575 }, { "epoch": 3.152266466316053, "grad_norm": 0.10639045903752259, "learning_rate": 1.538244946061104e-05, "loss": 0.4897, "num_tokens": 6581203825.0, "step": 1576 }, { "epoch": 3.1542699724517904, "grad_norm": 0.11479418397823195, "learning_rate": 1.536072428910345e-05, "loss": 0.4933, "num_tokens": 6585386696.0, "step": 1577 }, { "epoch": 3.1562734785875284, "grad_norm": 0.0933214949222758, "learning_rate": 1.5339010306554197e-05, "loss": 0.4841, "num_tokens": 6589560973.0, "step": 1578 }, { "epoch": 3.158276984723266, "grad_norm": 0.09330215635591968, "learning_rate": 1.5317307549557154e-05, "loss": 0.4857, "num_tokens": 6593755277.0, "step": 1579 }, { "epoch": 3.1602804908590034, "grad_norm": 0.10294953080904892, "learning_rate": 1.529561605468728e-05, "loss": 0.4903, "num_tokens": 6597911705.0, "step": 1580 }, { "epoch": 3.162283996994741, "grad_norm": 0.11827140623429327, "learning_rate": 1.5273935858500556e-05, "loss": 0.4897, "num_tokens": 6602078040.0, "step": 1581 }, { "epoch": 3.1642875031304785, "grad_norm": 0.09343884095148065, "learning_rate": 1.5252266997533917e-05, "loss": 0.4937, "num_tokens": 6606254212.0, "step": 1582 }, { "epoch": 3.166291009266216, "grad_norm": 0.0953380839415335, "learning_rate": 1.5230609508305208e-05, "loss": 0.4923, "num_tokens": 6610404335.0, "step": 1583 }, { "epoch": 3.1682945154019535, "grad_norm": 0.10022038187218706, "learning_rate": 1.5208963427313092e-05, "loss": 0.4966, "num_tokens": 6614584114.0, "step": 1584 }, { "epoch": 3.170298021537691, "grad_norm": 0.10578727001007089, "learning_rate": 1.518732879103702e-05, "loss": 0.5002, "num_tokens": 6618741894.0, "step": 1585 }, { "epoch": 3.1723015276734285, "grad_norm": 0.1030154582379365, "learning_rate": 1.5165705635937151e-05, "loss": 0.4806, "num_tokens": 6622936198.0, "step": 1586 }, { "epoch": 3.174305033809166, "grad_norm": 0.10369153665191842, "learning_rate": 1.5144093998454295e-05, "loss": 0.4797, "num_tokens": 6627113551.0, "step": 1587 }, { "epoch": 3.1763085399449036, "grad_norm": 0.08642269060838062, "learning_rate": 1.5122493915009849e-05, "loss": 0.4586, "num_tokens": 6631263601.0, "step": 1588 }, { "epoch": 3.178312046080641, "grad_norm": 0.09781797322360798, "learning_rate": 1.510090542200573e-05, "loss": 0.4817, "num_tokens": 6635422518.0, "step": 1589 }, { "epoch": 3.1803155522163786, "grad_norm": 0.0929122032211724, "learning_rate": 1.5079328555824351e-05, "loss": 0.4821, "num_tokens": 6639601397.0, "step": 1590 }, { "epoch": 3.182319058352116, "grad_norm": 0.0926337198621452, "learning_rate": 1.5057763352828505e-05, "loss": 0.4873, "num_tokens": 6643795701.0, "step": 1591 }, { "epoch": 3.1843225644878537, "grad_norm": 0.10426789451610576, "learning_rate": 1.5036209849361334e-05, "loss": 0.4804, "num_tokens": 6647990005.0, "step": 1592 }, { "epoch": 3.186326070623591, "grad_norm": 0.10446659662915064, "learning_rate": 1.5014668081746267e-05, "loss": 0.4837, "num_tokens": 6652183665.0, "step": 1593 }, { "epoch": 3.1883295767593287, "grad_norm": 0.10023232533953363, "learning_rate": 1.4993138086286956e-05, "loss": 0.4971, "num_tokens": 6656377969.0, "step": 1594 }, { "epoch": 3.1903330828950662, "grad_norm": 0.10197951345672505, "learning_rate": 1.4971619899267209e-05, "loss": 0.4848, "num_tokens": 6660572273.0, "step": 1595 }, { "epoch": 3.1923365890308038, "grad_norm": 0.11359496187259657, "learning_rate": 1.4950113556950937e-05, "loss": 0.4836, "num_tokens": 6664758247.0, "step": 1596 }, { "epoch": 3.1943400951665413, "grad_norm": 0.09583259962280964, "learning_rate": 1.4928619095582088e-05, "loss": 0.478, "num_tokens": 6668952551.0, "step": 1597 }, { "epoch": 3.196343601302279, "grad_norm": 0.12093133527411223, "learning_rate": 1.4907136551384596e-05, "loss": 0.4842, "num_tokens": 6673146855.0, "step": 1598 }, { "epoch": 3.1983471074380168, "grad_norm": 0.10033030320299961, "learning_rate": 1.4885665960562294e-05, "loss": 0.4861, "num_tokens": 6677311026.0, "step": 1599 }, { "epoch": 3.2003506135737543, "grad_norm": 0.1136336655895272, "learning_rate": 1.4864207359298877e-05, "loss": 0.4789, "num_tokens": 6681505330.0, "step": 1600 }, { "epoch": 3.202354119709492, "grad_norm": 0.12012941245186369, "learning_rate": 1.484276078375785e-05, "loss": 0.485, "num_tokens": 6685669870.0, "step": 1601 }, { "epoch": 3.2043576258452293, "grad_norm": 0.12124268880054556, "learning_rate": 1.4821326270082423e-05, "loss": 0.4899, "num_tokens": 6689864174.0, "step": 1602 }, { "epoch": 3.206361131980967, "grad_norm": 0.13447615644624014, "learning_rate": 1.47999038543955e-05, "loss": 0.4869, "num_tokens": 6694058478.0, "step": 1603 }, { "epoch": 3.2083646381167044, "grad_norm": 0.11398735543606431, "learning_rate": 1.4778493572799603e-05, "loss": 0.4789, "num_tokens": 6698252782.0, "step": 1604 }, { "epoch": 3.210368144252442, "grad_norm": 0.10981456455063823, "learning_rate": 1.475709546137678e-05, "loss": 0.4895, "num_tokens": 6702418108.0, "step": 1605 }, { "epoch": 3.2123716503881794, "grad_norm": 0.10285726476843621, "learning_rate": 1.4735709556188584e-05, "loss": 0.4765, "num_tokens": 6706582411.0, "step": 1606 }, { "epoch": 3.214375156523917, "grad_norm": 0.09897129053050209, "learning_rate": 1.4714335893275994e-05, "loss": 0.4815, "num_tokens": 6710776715.0, "step": 1607 }, { "epoch": 3.2163786626596544, "grad_norm": 0.09585206586721122, "learning_rate": 1.4692974508659364e-05, "loss": 0.4798, "num_tokens": 6714971019.0, "step": 1608 }, { "epoch": 3.218382168795392, "grad_norm": 0.09413197995991085, "learning_rate": 1.4671625438338341e-05, "loss": 0.4935, "num_tokens": 6719144644.0, "step": 1609 }, { "epoch": 3.2203856749311295, "grad_norm": 0.11048804465710084, "learning_rate": 1.4650288718291833e-05, "loss": 0.4838, "num_tokens": 6723325457.0, "step": 1610 }, { "epoch": 3.222389181066867, "grad_norm": 0.09376531078617592, "learning_rate": 1.4628964384477926e-05, "loss": 0.488, "num_tokens": 6727519761.0, "step": 1611 }, { "epoch": 3.2243926872026045, "grad_norm": 0.10242146302241659, "learning_rate": 1.4607652472833845e-05, "loss": 0.468, "num_tokens": 6731714065.0, "step": 1612 }, { "epoch": 3.226396193338342, "grad_norm": 0.10892510758761742, "learning_rate": 1.4586353019275862e-05, "loss": 0.4904, "num_tokens": 6735908369.0, "step": 1613 }, { "epoch": 3.2283996994740796, "grad_norm": 0.10402878495806943, "learning_rate": 1.4565066059699273e-05, "loss": 0.488, "num_tokens": 6740089654.0, "step": 1614 }, { "epoch": 3.230403205609817, "grad_norm": 0.09681857421518025, "learning_rate": 1.4543791629978295e-05, "loss": 0.4878, "num_tokens": 6744283958.0, "step": 1615 }, { "epoch": 3.2324067117455546, "grad_norm": 0.09398116976038719, "learning_rate": 1.4522529765966048e-05, "loss": 0.4936, "num_tokens": 6748469571.0, "step": 1616 }, { "epoch": 3.234410217881292, "grad_norm": 0.10307707838115306, "learning_rate": 1.4501280503494468e-05, "loss": 0.4862, "num_tokens": 6752663875.0, "step": 1617 }, { "epoch": 3.2364137240170296, "grad_norm": 0.09766902785004968, "learning_rate": 1.4480043878374255e-05, "loss": 0.4876, "num_tokens": 6756855022.0, "step": 1618 }, { "epoch": 3.238417230152767, "grad_norm": 0.10323842078147412, "learning_rate": 1.4458819926394818e-05, "loss": 0.4862, "num_tokens": 6761034604.0, "step": 1619 }, { "epoch": 3.2404207362885047, "grad_norm": 0.09265596253727953, "learning_rate": 1.4437608683324192e-05, "loss": 0.4769, "num_tokens": 6765228908.0, "step": 1620 }, { "epoch": 3.242424242424242, "grad_norm": 0.10072058777083931, "learning_rate": 1.4416410184909007e-05, "loss": 0.4918, "num_tokens": 6769423212.0, "step": 1621 }, { "epoch": 3.24442774855998, "grad_norm": 0.10425659423759581, "learning_rate": 1.4395224466874425e-05, "loss": 0.4816, "num_tokens": 6773617516.0, "step": 1622 }, { "epoch": 3.2464312546957177, "grad_norm": 0.10114031119878114, "learning_rate": 1.4374051564924036e-05, "loss": 0.4729, "num_tokens": 6777787694.0, "step": 1623 }, { "epoch": 3.248434760831455, "grad_norm": 0.09768419766200721, "learning_rate": 1.4352891514739864e-05, "loss": 0.485, "num_tokens": 6781963614.0, "step": 1624 }, { "epoch": 3.2504382669671927, "grad_norm": 0.11200449736337391, "learning_rate": 1.433174435198227e-05, "loss": 0.4848, "num_tokens": 6786130439.0, "step": 1625 }, { "epoch": 3.2524417731029303, "grad_norm": 0.1051279393420735, "learning_rate": 1.4310610112289873e-05, "loss": 0.5068, "num_tokens": 6790324743.0, "step": 1626 }, { "epoch": 3.2544452792386678, "grad_norm": 0.11279999024137691, "learning_rate": 1.4289488831279545e-05, "loss": 0.4675, "num_tokens": 6794507878.0, "step": 1627 }, { "epoch": 3.2564487853744053, "grad_norm": 0.10034565510025444, "learning_rate": 1.4268380544546285e-05, "loss": 0.4845, "num_tokens": 6798670192.0, "step": 1628 }, { "epoch": 3.258452291510143, "grad_norm": 0.11656558668116104, "learning_rate": 1.424728528766322e-05, "loss": 0.4786, "num_tokens": 6802864496.0, "step": 1629 }, { "epoch": 3.2604557976458803, "grad_norm": 0.11939673472716772, "learning_rate": 1.4226203096181519e-05, "loss": 0.4847, "num_tokens": 6807037374.0, "step": 1630 }, { "epoch": 3.262459303781618, "grad_norm": 0.108678574909612, "learning_rate": 1.420513400563031e-05, "loss": 0.4817, "num_tokens": 6811221102.0, "step": 1631 }, { "epoch": 3.2644628099173554, "grad_norm": 0.10761725674981082, "learning_rate": 1.4184078051516656e-05, "loss": 0.476, "num_tokens": 6815415406.0, "step": 1632 }, { "epoch": 3.266466316053093, "grad_norm": 0.1031497072175544, "learning_rate": 1.4163035269325493e-05, "loss": 0.4697, "num_tokens": 6819609710.0, "step": 1633 }, { "epoch": 3.2684698221888304, "grad_norm": 0.1117867900674895, "learning_rate": 1.4142005694519539e-05, "loss": 0.4747, "num_tokens": 6823745217.0, "step": 1634 }, { "epoch": 3.270473328324568, "grad_norm": 0.10327648200500404, "learning_rate": 1.4120989362539259e-05, "loss": 0.4885, "num_tokens": 6827920350.0, "step": 1635 }, { "epoch": 3.2724768344603055, "grad_norm": 0.0939653767448184, "learning_rate": 1.40999863088028e-05, "loss": 0.4789, "num_tokens": 6832101339.0, "step": 1636 }, { "epoch": 3.274480340596043, "grad_norm": 0.09788505252456416, "learning_rate": 1.407899656870595e-05, "loss": 0.4717, "num_tokens": 6836295643.0, "step": 1637 }, { "epoch": 3.2764838467317805, "grad_norm": 0.09172809545476189, "learning_rate": 1.405802017762201e-05, "loss": 0.5018, "num_tokens": 6840458686.0, "step": 1638 }, { "epoch": 3.278487352867518, "grad_norm": 0.0922312437652333, "learning_rate": 1.4037057170901869e-05, "loss": 0.4879, "num_tokens": 6844637562.0, "step": 1639 }, { "epoch": 3.2804908590032555, "grad_norm": 0.10040358645439756, "learning_rate": 1.4016107583873773e-05, "loss": 0.4873, "num_tokens": 6848825141.0, "step": 1640 }, { "epoch": 3.282494365138993, "grad_norm": 0.09337317338062798, "learning_rate": 1.3995171451843404e-05, "loss": 0.4806, "num_tokens": 6852987064.0, "step": 1641 }, { "epoch": 3.2844978712747306, "grad_norm": 0.10538510780959512, "learning_rate": 1.3974248810093744e-05, "loss": 0.479, "num_tokens": 6857181368.0, "step": 1642 }, { "epoch": 3.2865013774104685, "grad_norm": 0.08717577472610284, "learning_rate": 1.395333969388506e-05, "loss": 0.4835, "num_tokens": 6861348729.0, "step": 1643 }, { "epoch": 3.2885048835462056, "grad_norm": 0.1033491859290238, "learning_rate": 1.3932444138454802e-05, "loss": 0.4812, "num_tokens": 6865534732.0, "step": 1644 }, { "epoch": 3.2905083896819436, "grad_norm": 0.11846867402220332, "learning_rate": 1.3911562179017596e-05, "loss": 0.4872, "num_tokens": 6869723187.0, "step": 1645 }, { "epoch": 3.292511895817681, "grad_norm": 0.08286660708445097, "learning_rate": 1.389069385076512e-05, "loss": 0.4814, "num_tokens": 6873917491.0, "step": 1646 }, { "epoch": 3.2945154019534186, "grad_norm": 0.10844408352251825, "learning_rate": 1.3869839188866108e-05, "loss": 0.4858, "num_tokens": 6878101528.0, "step": 1647 }, { "epoch": 3.296518908089156, "grad_norm": 0.1031092647784647, "learning_rate": 1.3848998228466253e-05, "loss": 0.4906, "num_tokens": 6882295832.0, "step": 1648 }, { "epoch": 3.2985224142248937, "grad_norm": 0.09472507467099817, "learning_rate": 1.382817100468816e-05, "loss": 0.4727, "num_tokens": 6886490136.0, "step": 1649 }, { "epoch": 3.300525920360631, "grad_norm": 0.10733408481041874, "learning_rate": 1.3807357552631271e-05, "loss": 0.4857, "num_tokens": 6890684440.0, "step": 1650 }, { "epoch": 3.3025294264963687, "grad_norm": 0.11614862832251582, "learning_rate": 1.378655790737184e-05, "loss": 0.4826, "num_tokens": 6894878744.0, "step": 1651 }, { "epoch": 3.3045329326321062, "grad_norm": 0.10607676047924837, "learning_rate": 1.376577210396283e-05, "loss": 0.4887, "num_tokens": 6899056594.0, "step": 1652 }, { "epoch": 3.3065364387678438, "grad_norm": 0.10721248340350749, "learning_rate": 1.3745000177433902e-05, "loss": 0.5033, "num_tokens": 6903250898.0, "step": 1653 }, { "epoch": 3.3085399449035813, "grad_norm": 0.103306754823588, "learning_rate": 1.3724242162791299e-05, "loss": 0.4716, "num_tokens": 6907445202.0, "step": 1654 }, { "epoch": 3.310543451039319, "grad_norm": 0.097793245173385, "learning_rate": 1.370349809501786e-05, "loss": 0.4756, "num_tokens": 6911625774.0, "step": 1655 }, { "epoch": 3.3125469571750563, "grad_norm": 0.11729234902116642, "learning_rate": 1.368276800907287e-05, "loss": 0.4874, "num_tokens": 6915815073.0, "step": 1656 }, { "epoch": 3.314550463310794, "grad_norm": 0.09434097065708066, "learning_rate": 1.3662051939892106e-05, "loss": 0.4829, "num_tokens": 6920009377.0, "step": 1657 }, { "epoch": 3.3165539694465314, "grad_norm": 0.11545617391896969, "learning_rate": 1.364134992238767e-05, "loss": 0.4904, "num_tokens": 6924198784.0, "step": 1658 }, { "epoch": 3.318557475582269, "grad_norm": 0.0938636942211511, "learning_rate": 1.362066199144803e-05, "loss": 0.4921, "num_tokens": 6928346396.0, "step": 1659 }, { "epoch": 3.3205609817180064, "grad_norm": 0.10413588159258323, "learning_rate": 1.3599988181937875e-05, "loss": 0.4823, "num_tokens": 6932540700.0, "step": 1660 }, { "epoch": 3.322564487853744, "grad_norm": 0.10173161808164839, "learning_rate": 1.3579328528698125e-05, "loss": 0.4945, "num_tokens": 6936713561.0, "step": 1661 }, { "epoch": 3.3245679939894814, "grad_norm": 0.10974963261072708, "learning_rate": 1.3558683066545818e-05, "loss": 0.4846, "num_tokens": 6940907865.0, "step": 1662 }, { "epoch": 3.326571500125219, "grad_norm": 0.09003885944051922, "learning_rate": 1.3538051830274103e-05, "loss": 0.4957, "num_tokens": 6945102169.0, "step": 1663 }, { "epoch": 3.3285750062609565, "grad_norm": 0.1140244744072672, "learning_rate": 1.3517434854652126e-05, "loss": 0.4783, "num_tokens": 6949294211.0, "step": 1664 }, { "epoch": 3.330578512396694, "grad_norm": 0.09750975411829227, "learning_rate": 1.3496832174425023e-05, "loss": 0.481, "num_tokens": 6953488515.0, "step": 1665 }, { "epoch": 3.332582018532432, "grad_norm": 0.09284186972586805, "learning_rate": 1.3476243824313829e-05, "loss": 0.4839, "num_tokens": 6957682819.0, "step": 1666 }, { "epoch": 3.3345855246681695, "grad_norm": 0.11087324484198335, "learning_rate": 1.3455669839015436e-05, "loss": 0.4933, "num_tokens": 6961862733.0, "step": 1667 }, { "epoch": 3.336589030803907, "grad_norm": 0.10523781441428721, "learning_rate": 1.3435110253202505e-05, "loss": 0.4836, "num_tokens": 6966057037.0, "step": 1668 }, { "epoch": 3.3385925369396445, "grad_norm": 0.09614623264496773, "learning_rate": 1.3414565101523466e-05, "loss": 0.4842, "num_tokens": 6970245042.0, "step": 1669 }, { "epoch": 3.340596043075382, "grad_norm": 0.10400040587387493, "learning_rate": 1.3394034418602389e-05, "loss": 0.4951, "num_tokens": 6974429423.0, "step": 1670 }, { "epoch": 3.3425995492111196, "grad_norm": 0.10311435224839889, "learning_rate": 1.3373518239038985e-05, "loss": 0.4801, "num_tokens": 6978623727.0, "step": 1671 }, { "epoch": 3.344603055346857, "grad_norm": 0.09211777096194068, "learning_rate": 1.3353016597408502e-05, "loss": 0.4813, "num_tokens": 6982818031.0, "step": 1672 }, { "epoch": 3.3466065614825946, "grad_norm": 0.10900767611366198, "learning_rate": 1.3332529528261712e-05, "loss": 0.4811, "num_tokens": 6986988707.0, "step": 1673 }, { "epoch": 3.348610067618332, "grad_norm": 0.09266906995145983, "learning_rate": 1.3312057066124796e-05, "loss": 0.485, "num_tokens": 6991156110.0, "step": 1674 }, { "epoch": 3.3506135737540697, "grad_norm": 0.10042749816545121, "learning_rate": 1.3291599245499365e-05, "loss": 0.5012, "num_tokens": 6995350414.0, "step": 1675 }, { "epoch": 3.352617079889807, "grad_norm": 0.08784758012128703, "learning_rate": 1.3271156100862302e-05, "loss": 0.4687, "num_tokens": 6999544718.0, "step": 1676 }, { "epoch": 3.3546205860255447, "grad_norm": 0.09842030830128319, "learning_rate": 1.3250727666665803e-05, "loss": 0.5029, "num_tokens": 7003713586.0, "step": 1677 }, { "epoch": 3.356624092161282, "grad_norm": 0.09223489234832635, "learning_rate": 1.3230313977337235e-05, "loss": 0.4762, "num_tokens": 7007907890.0, "step": 1678 }, { "epoch": 3.3586275982970197, "grad_norm": 0.09790114672524979, "learning_rate": 1.320991506727915e-05, "loss": 0.4862, "num_tokens": 7012072955.0, "step": 1679 }, { "epoch": 3.3606311044327573, "grad_norm": 0.08844768693881276, "learning_rate": 1.3189530970869164e-05, "loss": 0.4715, "num_tokens": 7016254184.0, "step": 1680 }, { "epoch": 3.3626346105684948, "grad_norm": 0.09871224705381766, "learning_rate": 1.3169161722459945e-05, "loss": 0.4797, "num_tokens": 7020448488.0, "step": 1681 }, { "epoch": 3.3646381167042323, "grad_norm": 0.10466173346906912, "learning_rate": 1.314880735637913e-05, "loss": 0.4838, "num_tokens": 7024642792.0, "step": 1682 }, { "epoch": 3.36664162283997, "grad_norm": 0.11898891288884104, "learning_rate": 1.3128467906929294e-05, "loss": 0.4884, "num_tokens": 7028837096.0, "step": 1683 }, { "epoch": 3.3686451289757073, "grad_norm": 0.11315668354409876, "learning_rate": 1.3108143408387838e-05, "loss": 0.4737, "num_tokens": 7033004838.0, "step": 1684 }, { "epoch": 3.370648635111445, "grad_norm": 0.09536047256806474, "learning_rate": 1.3087833895007007e-05, "loss": 0.4811, "num_tokens": 7037199142.0, "step": 1685 }, { "epoch": 3.3726521412471824, "grad_norm": 0.12811373862160255, "learning_rate": 1.306753940101375e-05, "loss": 0.4906, "num_tokens": 7041367442.0, "step": 1686 }, { "epoch": 3.3746556473829203, "grad_norm": 0.09545025324434521, "learning_rate": 1.3047259960609753e-05, "loss": 0.4764, "num_tokens": 7045561746.0, "step": 1687 }, { "epoch": 3.3766591535186574, "grad_norm": 0.09843073250482177, "learning_rate": 1.3026995607971281e-05, "loss": 0.4986, "num_tokens": 7049756050.0, "step": 1688 }, { "epoch": 3.3786626596543954, "grad_norm": 0.09864461509825509, "learning_rate": 1.3006746377249211e-05, "loss": 0.4856, "num_tokens": 7053925351.0, "step": 1689 }, { "epoch": 3.380666165790133, "grad_norm": 0.092247760073484, "learning_rate": 1.2986512302568923e-05, "loss": 0.4869, "num_tokens": 7058119655.0, "step": 1690 }, { "epoch": 3.3826696719258704, "grad_norm": 0.09059523151733419, "learning_rate": 1.2966293418030245e-05, "loss": 0.4733, "num_tokens": 7062313959.0, "step": 1691 }, { "epoch": 3.384673178061608, "grad_norm": 0.09289119271599056, "learning_rate": 1.2946089757707417e-05, "loss": 0.4785, "num_tokens": 7066494511.0, "step": 1692 }, { "epoch": 3.3866766841973455, "grad_norm": 0.10685901254953178, "learning_rate": 1.2925901355649032e-05, "loss": 0.486, "num_tokens": 7070632287.0, "step": 1693 }, { "epoch": 3.388680190333083, "grad_norm": 0.08971953907950084, "learning_rate": 1.2905728245877936e-05, "loss": 0.4838, "num_tokens": 7074824722.0, "step": 1694 }, { "epoch": 3.3906836964688205, "grad_norm": 0.10615967787201541, "learning_rate": 1.2885570462391237e-05, "loss": 0.4692, "num_tokens": 7078985772.0, "step": 1695 }, { "epoch": 3.392687202604558, "grad_norm": 0.1036461719308151, "learning_rate": 1.2865428039160199e-05, "loss": 0.4918, "num_tokens": 7083180076.0, "step": 1696 }, { "epoch": 3.3946907087402955, "grad_norm": 0.0995485791775303, "learning_rate": 1.28453010101302e-05, "loss": 0.4774, "num_tokens": 7087374380.0, "step": 1697 }, { "epoch": 3.396694214876033, "grad_norm": 0.11017338336298825, "learning_rate": 1.282518940922068e-05, "loss": 0.4752, "num_tokens": 7091540631.0, "step": 1698 }, { "epoch": 3.3986977210117706, "grad_norm": 0.09552090671906655, "learning_rate": 1.2805093270325064e-05, "loss": 0.4889, "num_tokens": 7095734935.0, "step": 1699 }, { "epoch": 3.400701227147508, "grad_norm": 0.09733515137397332, "learning_rate": 1.2785012627310737e-05, "loss": 0.4919, "num_tokens": 7099929239.0, "step": 1700 }, { "epoch": 3.4027047332832456, "grad_norm": 0.0977103823091474, "learning_rate": 1.2764947514018963e-05, "loss": 0.485, "num_tokens": 7104072341.0, "step": 1701 }, { "epoch": 3.404708239418983, "grad_norm": 0.09174996993654637, "learning_rate": 1.2744897964264844e-05, "loss": 0.4835, "num_tokens": 7108266645.0, "step": 1702 }, { "epoch": 3.4067117455547207, "grad_norm": 0.09557119752186087, "learning_rate": 1.2724864011837222e-05, "loss": 0.4778, "num_tokens": 7112460949.0, "step": 1703 }, { "epoch": 3.408715251690458, "grad_norm": 0.0867703120339142, "learning_rate": 1.2704845690498688e-05, "loss": 0.4826, "num_tokens": 7116640692.0, "step": 1704 }, { "epoch": 3.4107187578261957, "grad_norm": 0.08822770338204816, "learning_rate": 1.268484303398547e-05, "loss": 0.4765, "num_tokens": 7120783290.0, "step": 1705 }, { "epoch": 3.4127222639619332, "grad_norm": 0.08515551768855922, "learning_rate": 1.2664856076007413e-05, "loss": 0.4881, "num_tokens": 7124977594.0, "step": 1706 }, { "epoch": 3.4147257700976708, "grad_norm": 0.08979736474798362, "learning_rate": 1.264488485024788e-05, "loss": 0.4976, "num_tokens": 7129171898.0, "step": 1707 }, { "epoch": 3.4167292762334083, "grad_norm": 0.09912033790670617, "learning_rate": 1.2624929390363754e-05, "loss": 0.4847, "num_tokens": 7133338739.0, "step": 1708 }, { "epoch": 3.418732782369146, "grad_norm": 0.09651959719721198, "learning_rate": 1.260498972998531e-05, "loss": 0.482, "num_tokens": 7137516449.0, "step": 1709 }, { "epoch": 3.4207362885048838, "grad_norm": 0.09568933571881624, "learning_rate": 1.2585065902716244e-05, "loss": 0.4798, "num_tokens": 7141692024.0, "step": 1710 }, { "epoch": 3.4227397946406213, "grad_norm": 0.104025129932623, "learning_rate": 1.2565157942133526e-05, "loss": 0.4814, "num_tokens": 7145871454.0, "step": 1711 }, { "epoch": 3.424743300776359, "grad_norm": 0.10003787087554167, "learning_rate": 1.2545265881787411e-05, "loss": 0.4962, "num_tokens": 7150065758.0, "step": 1712 }, { "epoch": 3.4267468069120963, "grad_norm": 0.10156204229439354, "learning_rate": 1.2525389755201342e-05, "loss": 0.4916, "num_tokens": 7154237869.0, "step": 1713 }, { "epoch": 3.428750313047834, "grad_norm": 0.10071759549123686, "learning_rate": 1.2505529595871929e-05, "loss": 0.4891, "num_tokens": 7158413120.0, "step": 1714 }, { "epoch": 3.4307538191835714, "grad_norm": 0.0955521011104108, "learning_rate": 1.2485685437268847e-05, "loss": 0.4784, "num_tokens": 7162564086.0, "step": 1715 }, { "epoch": 3.432757325319309, "grad_norm": 0.08911666653863433, "learning_rate": 1.2465857312834832e-05, "loss": 0.4928, "num_tokens": 7166751974.0, "step": 1716 }, { "epoch": 3.4347608314550464, "grad_norm": 0.10259832409949861, "learning_rate": 1.2446045255985575e-05, "loss": 0.4897, "num_tokens": 7170946278.0, "step": 1717 }, { "epoch": 3.436764337590784, "grad_norm": 0.09661084190294426, "learning_rate": 1.2426249300109708e-05, "loss": 0.4942, "num_tokens": 7175140582.0, "step": 1718 }, { "epoch": 3.4387678437265214, "grad_norm": 0.09003518880800164, "learning_rate": 1.2406469478568712e-05, "loss": 0.4779, "num_tokens": 7179334886.0, "step": 1719 }, { "epoch": 3.440771349862259, "grad_norm": 0.09354305982292273, "learning_rate": 1.23867058246969e-05, "loss": 0.4884, "num_tokens": 7183510791.0, "step": 1720 }, { "epoch": 3.4427748559979965, "grad_norm": 0.09076305763170546, "learning_rate": 1.2366958371801311e-05, "loss": 0.4812, "num_tokens": 7187705095.0, "step": 1721 }, { "epoch": 3.444778362133734, "grad_norm": 0.12253708490861866, "learning_rate": 1.2347227153161705e-05, "loss": 0.4992, "num_tokens": 7191899399.0, "step": 1722 }, { "epoch": 3.4467818682694715, "grad_norm": 0.09905223988011931, "learning_rate": 1.2327512202030456e-05, "loss": 0.4799, "num_tokens": 7196093703.0, "step": 1723 }, { "epoch": 3.448785374405209, "grad_norm": 0.09284517685770134, "learning_rate": 1.2307813551632553e-05, "loss": 0.4864, "num_tokens": 7200276785.0, "step": 1724 }, { "epoch": 3.4507888805409466, "grad_norm": 0.10696133656431021, "learning_rate": 1.228813123516549e-05, "loss": 0.4987, "num_tokens": 7204419010.0, "step": 1725 }, { "epoch": 3.452792386676684, "grad_norm": 0.0969164662411756, "learning_rate": 1.226846528579925e-05, "loss": 0.4836, "num_tokens": 7208569874.0, "step": 1726 }, { "epoch": 3.4547958928124216, "grad_norm": 0.09782399014563874, "learning_rate": 1.2248815736676214e-05, "loss": 0.4951, "num_tokens": 7212759249.0, "step": 1727 }, { "epoch": 3.456799398948159, "grad_norm": 0.09593257167514571, "learning_rate": 1.2229182620911158e-05, "loss": 0.4817, "num_tokens": 7216914908.0, "step": 1728 }, { "epoch": 3.4588029050838967, "grad_norm": 0.09204718369958917, "learning_rate": 1.2209565971591123e-05, "loss": 0.4818, "num_tokens": 7221109212.0, "step": 1729 }, { "epoch": 3.460806411219634, "grad_norm": 0.1058407779839112, "learning_rate": 1.2189965821775434e-05, "loss": 0.4928, "num_tokens": 7225284845.0, "step": 1730 }, { "epoch": 3.462809917355372, "grad_norm": 0.093483114353475, "learning_rate": 1.2170382204495578e-05, "loss": 0.4883, "num_tokens": 7229479149.0, "step": 1731 }, { "epoch": 3.464813423491109, "grad_norm": 0.09381120661200178, "learning_rate": 1.2150815152755211e-05, "loss": 0.4991, "num_tokens": 7233656584.0, "step": 1732 }, { "epoch": 3.466816929626847, "grad_norm": 0.09359666223262765, "learning_rate": 1.213126469953005e-05, "loss": 0.4802, "num_tokens": 7237791433.0, "step": 1733 }, { "epoch": 3.4688204357625847, "grad_norm": 0.10686621430269298, "learning_rate": 1.2111730877767851e-05, "loss": 0.4809, "num_tokens": 7241934858.0, "step": 1734 }, { "epoch": 3.470823941898322, "grad_norm": 0.10119088849456247, "learning_rate": 1.209221372038833e-05, "loss": 0.4797, "num_tokens": 7246129162.0, "step": 1735 }, { "epoch": 3.4728274480340597, "grad_norm": 0.09868934444002672, "learning_rate": 1.207271326028313e-05, "loss": 0.4842, "num_tokens": 7250323466.0, "step": 1736 }, { "epoch": 3.4748309541697973, "grad_norm": 0.10004668571502116, "learning_rate": 1.2053229530315754e-05, "loss": 0.4833, "num_tokens": 7254512462.0, "step": 1737 }, { "epoch": 3.476834460305535, "grad_norm": 0.09355902077368872, "learning_rate": 1.2033762563321507e-05, "loss": 0.4892, "num_tokens": 7258706766.0, "step": 1738 }, { "epoch": 3.4788379664412723, "grad_norm": 0.09137153113946912, "learning_rate": 1.2014312392107437e-05, "loss": 0.4977, "num_tokens": 7262872205.0, "step": 1739 }, { "epoch": 3.48084147257701, "grad_norm": 0.09973477392884442, "learning_rate": 1.1994879049452304e-05, "loss": 0.4863, "num_tokens": 7267066509.0, "step": 1740 }, { "epoch": 3.4828449787127473, "grad_norm": 0.09372568795197327, "learning_rate": 1.1975462568106482e-05, "loss": 0.4698, "num_tokens": 7271260813.0, "step": 1741 }, { "epoch": 3.484848484848485, "grad_norm": 0.0873706690701727, "learning_rate": 1.195606298079196e-05, "loss": 0.4919, "num_tokens": 7275431397.0, "step": 1742 }, { "epoch": 3.4868519909842224, "grad_norm": 0.08931638840753034, "learning_rate": 1.1936680320202228e-05, "loss": 0.4895, "num_tokens": 7279623017.0, "step": 1743 }, { "epoch": 3.48885549711996, "grad_norm": 0.10075876426224503, "learning_rate": 1.1917314619002264e-05, "loss": 0.4749, "num_tokens": 7283817321.0, "step": 1744 }, { "epoch": 3.4908590032556974, "grad_norm": 0.08591605373466196, "learning_rate": 1.1897965909828468e-05, "loss": 0.4911, "num_tokens": 7288011625.0, "step": 1745 }, { "epoch": 3.492862509391435, "grad_norm": 0.09593064695888373, "learning_rate": 1.1878634225288602e-05, "loss": 0.4722, "num_tokens": 7292124273.0, "step": 1746 }, { "epoch": 3.4948660155271725, "grad_norm": 0.08899702425411736, "learning_rate": 1.185931959796172e-05, "loss": 0.4885, "num_tokens": 7296301232.0, "step": 1747 }, { "epoch": 3.49686952166291, "grad_norm": 0.11454348899671649, "learning_rate": 1.1840022060398159e-05, "loss": 0.5016, "num_tokens": 7300463318.0, "step": 1748 }, { "epoch": 3.4988730277986475, "grad_norm": 0.0896181012899833, "learning_rate": 1.1820741645119426e-05, "loss": 0.4892, "num_tokens": 7304657622.0, "step": 1749 }, { "epoch": 3.500876533934385, "grad_norm": 0.08594791506841977, "learning_rate": 1.1801478384618202e-05, "loss": 0.4711, "num_tokens": 7308838511.0, "step": 1750 }, { "epoch": 3.5028800400701225, "grad_norm": 0.10008227265116675, "learning_rate": 1.1782232311358222e-05, "loss": 0.4925, "num_tokens": 7313032815.0, "step": 1751 }, { "epoch": 3.5048835462058605, "grad_norm": 0.0882294014034115, "learning_rate": 1.176300345777429e-05, "loss": 0.4859, "num_tokens": 7317227119.0, "step": 1752 }, { "epoch": 3.5068870523415976, "grad_norm": 0.09957628317782037, "learning_rate": 1.1743791856272176e-05, "loss": 0.489, "num_tokens": 7321412430.0, "step": 1753 }, { "epoch": 3.5088905584773356, "grad_norm": 0.1009075553873941, "learning_rate": 1.1724597539228567e-05, "loss": 0.4844, "num_tokens": 7325606734.0, "step": 1754 }, { "epoch": 3.5108940646130726, "grad_norm": 0.09029341023882001, "learning_rate": 1.1705420538991033e-05, "loss": 0.4778, "num_tokens": 7329800461.0, "step": 1755 }, { "epoch": 3.5128975707488106, "grad_norm": 0.09105957399872086, "learning_rate": 1.1686260887877968e-05, "loss": 0.4835, "num_tokens": 7333994765.0, "step": 1756 }, { "epoch": 3.514901076884548, "grad_norm": 0.11787960196795937, "learning_rate": 1.1667118618178499e-05, "loss": 0.4902, "num_tokens": 7338137192.0, "step": 1757 }, { "epoch": 3.5169045830202856, "grad_norm": 0.10184643566693198, "learning_rate": 1.1647993762152492e-05, "loss": 0.4985, "num_tokens": 7342331496.0, "step": 1758 }, { "epoch": 3.518908089156023, "grad_norm": 0.09812163220503443, "learning_rate": 1.162888635203045e-05, "loss": 0.4831, "num_tokens": 7346496354.0, "step": 1759 }, { "epoch": 3.5209115952917607, "grad_norm": 0.10554280251607871, "learning_rate": 1.1609796420013475e-05, "loss": 0.4904, "num_tokens": 7350690658.0, "step": 1760 }, { "epoch": 3.522915101427498, "grad_norm": 0.09243108366542714, "learning_rate": 1.1590723998273226e-05, "loss": 0.4853, "num_tokens": 7354879516.0, "step": 1761 }, { "epoch": 3.5249186075632357, "grad_norm": 0.09146320580807932, "learning_rate": 1.1571669118951827e-05, "loss": 0.475, "num_tokens": 7359027489.0, "step": 1762 }, { "epoch": 3.5269221136989732, "grad_norm": 0.10031837017604386, "learning_rate": 1.1552631814161867e-05, "loss": 0.474, "num_tokens": 7363221793.0, "step": 1763 }, { "epoch": 3.5289256198347108, "grad_norm": 0.09241706061142829, "learning_rate": 1.153361211598631e-05, "loss": 0.4827, "num_tokens": 7367399012.0, "step": 1764 }, { "epoch": 3.5309291259704483, "grad_norm": 0.0956279189011527, "learning_rate": 1.1514610056478428e-05, "loss": 0.4829, "num_tokens": 7371593316.0, "step": 1765 }, { "epoch": 3.532932632106186, "grad_norm": 0.08700697496658412, "learning_rate": 1.149562566766179e-05, "loss": 0.487, "num_tokens": 7375762908.0, "step": 1766 }, { "epoch": 3.5349361382419233, "grad_norm": 0.10407532659190609, "learning_rate": 1.1476658981530182e-05, "loss": 0.473, "num_tokens": 7379948356.0, "step": 1767 }, { "epoch": 3.536939644377661, "grad_norm": 0.08977612047891045, "learning_rate": 1.1457710030047545e-05, "loss": 0.4794, "num_tokens": 7384136962.0, "step": 1768 }, { "epoch": 3.5389431505133984, "grad_norm": 0.0853535911987569, "learning_rate": 1.1438778845147943e-05, "loss": 0.4892, "num_tokens": 7388331266.0, "step": 1769 }, { "epoch": 3.540946656649136, "grad_norm": 0.0910169027522694, "learning_rate": 1.1419865458735488e-05, "loss": 0.4761, "num_tokens": 7392525570.0, "step": 1770 }, { "epoch": 3.5429501627848734, "grad_norm": 0.09466968477606422, "learning_rate": 1.1400969902684303e-05, "loss": 0.4886, "num_tokens": 7396719874.0, "step": 1771 }, { "epoch": 3.544953668920611, "grad_norm": 0.08465483406622323, "learning_rate": 1.138209220883847e-05, "loss": 0.4862, "num_tokens": 7400914178.0, "step": 1772 }, { "epoch": 3.5469571750563484, "grad_norm": 0.09000502667396995, "learning_rate": 1.1363232409011967e-05, "loss": 0.4999, "num_tokens": 7405108482.0, "step": 1773 }, { "epoch": 3.548960681192086, "grad_norm": 0.09106800265301557, "learning_rate": 1.1344390534988588e-05, "loss": 0.4799, "num_tokens": 7409290087.0, "step": 1774 }, { "epoch": 3.550964187327824, "grad_norm": 0.09293601471131345, "learning_rate": 1.132556661852196e-05, "loss": 0.4981, "num_tokens": 7413459580.0, "step": 1775 }, { "epoch": 3.552967693463561, "grad_norm": 0.09004499082346004, "learning_rate": 1.130676069133541e-05, "loss": 0.4857, "num_tokens": 7417653884.0, "step": 1776 }, { "epoch": 3.554971199599299, "grad_norm": 0.08944772329274465, "learning_rate": 1.1287972785121975e-05, "loss": 0.4854, "num_tokens": 7421848188.0, "step": 1777 }, { "epoch": 3.556974705735036, "grad_norm": 0.1064639053724264, "learning_rate": 1.12692029315443e-05, "loss": 0.4862, "num_tokens": 7426042492.0, "step": 1778 }, { "epoch": 3.558978211870774, "grad_norm": 0.09172956929553808, "learning_rate": 1.125045116223463e-05, "loss": 0.4876, "num_tokens": 7430236796.0, "step": 1779 }, { "epoch": 3.5609817180065115, "grad_norm": 0.09711466991119759, "learning_rate": 1.1231717508794696e-05, "loss": 0.4801, "num_tokens": 7434431100.0, "step": 1780 }, { "epoch": 3.562985224142249, "grad_norm": 0.1169974453066529, "learning_rate": 1.1213002002795757e-05, "loss": 0.4862, "num_tokens": 7438625404.0, "step": 1781 }, { "epoch": 3.5649887302779866, "grad_norm": 0.08947432126824544, "learning_rate": 1.119430467577843e-05, "loss": 0.4882, "num_tokens": 7442819708.0, "step": 1782 }, { "epoch": 3.566992236413724, "grad_norm": 0.09912897152321969, "learning_rate": 1.1175625559252731e-05, "loss": 0.4892, "num_tokens": 7446993576.0, "step": 1783 }, { "epoch": 3.5689957425494616, "grad_norm": 0.09166843597854364, "learning_rate": 1.115696468469797e-05, "loss": 0.4652, "num_tokens": 7451173974.0, "step": 1784 }, { "epoch": 3.570999248685199, "grad_norm": 0.08796156404560826, "learning_rate": 1.1138322083562728e-05, "loss": 0.4841, "num_tokens": 7455337088.0, "step": 1785 }, { "epoch": 3.5730027548209367, "grad_norm": 0.09466270291065457, "learning_rate": 1.1119697787264769e-05, "loss": 0.4784, "num_tokens": 7459531392.0, "step": 1786 }, { "epoch": 3.575006260956674, "grad_norm": 0.09147488579091695, "learning_rate": 1.1101091827191038e-05, "loss": 0.4781, "num_tokens": 7463711651.0, "step": 1787 }, { "epoch": 3.5770097670924117, "grad_norm": 0.09497869753265184, "learning_rate": 1.1082504234697548e-05, "loss": 0.4907, "num_tokens": 7467905955.0, "step": 1788 }, { "epoch": 3.579013273228149, "grad_norm": 0.08999317098759117, "learning_rate": 1.1063935041109379e-05, "loss": 0.4754, "num_tokens": 7472080096.0, "step": 1789 }, { "epoch": 3.5810167793638867, "grad_norm": 0.08433866909334893, "learning_rate": 1.10453842777206e-05, "loss": 0.4716, "num_tokens": 7476246230.0, "step": 1790 }, { "epoch": 3.5830202854996243, "grad_norm": 0.0917401546714576, "learning_rate": 1.1026851975794216e-05, "loss": 0.4848, "num_tokens": 7480425200.0, "step": 1791 }, { "epoch": 3.585023791635362, "grad_norm": 0.08508864329386331, "learning_rate": 1.1008338166562114e-05, "loss": 0.491, "num_tokens": 7484619504.0, "step": 1792 }, { "epoch": 3.5870272977710993, "grad_norm": 0.09769647905009582, "learning_rate": 1.0989842881225034e-05, "loss": 0.4988, "num_tokens": 7488794017.0, "step": 1793 }, { "epoch": 3.589030803906837, "grad_norm": 0.09360686664532683, "learning_rate": 1.0971366150952476e-05, "loss": 0.4884, "num_tokens": 7492988321.0, "step": 1794 }, { "epoch": 3.5910343100425743, "grad_norm": 0.09347648242625306, "learning_rate": 1.0952908006882692e-05, "loss": 0.4797, "num_tokens": 7497153320.0, "step": 1795 }, { "epoch": 3.5930378161783123, "grad_norm": 0.09430327511979292, "learning_rate": 1.0934468480122587e-05, "loss": 0.489, "num_tokens": 7501318761.0, "step": 1796 }, { "epoch": 3.5950413223140494, "grad_norm": 0.10096615006477672, "learning_rate": 1.0916047601747717e-05, "loss": 0.4855, "num_tokens": 7505513065.0, "step": 1797 }, { "epoch": 3.5970448284497873, "grad_norm": 0.09091507795075053, "learning_rate": 1.0897645402802176e-05, "loss": 0.4911, "num_tokens": 7509707369.0, "step": 1798 }, { "epoch": 3.5990483345855244, "grad_norm": 0.09091475628686278, "learning_rate": 1.0879261914298628e-05, "loss": 0.4727, "num_tokens": 7513900383.0, "step": 1799 }, { "epoch": 3.6010518407212624, "grad_norm": 0.10537450160676616, "learning_rate": 1.0860897167218156e-05, "loss": 0.4828, "num_tokens": 7518051279.0, "step": 1800 }, { "epoch": 3.603055346857, "grad_norm": 0.09410773880024782, "learning_rate": 1.0842551192510293e-05, "loss": 0.4888, "num_tokens": 7522245583.0, "step": 1801 }, { "epoch": 3.6050588529927374, "grad_norm": 0.09204687078802819, "learning_rate": 1.0824224021092905e-05, "loss": 0.4867, "num_tokens": 7526439887.0, "step": 1802 }, { "epoch": 3.607062359128475, "grad_norm": 0.09302386275413957, "learning_rate": 1.0805915683852204e-05, "loss": 0.4885, "num_tokens": 7530634191.0, "step": 1803 }, { "epoch": 3.6090658652642125, "grad_norm": 0.08570136474856334, "learning_rate": 1.0787626211642628e-05, "loss": 0.4754, "num_tokens": 7534828495.0, "step": 1804 }, { "epoch": 3.61106937139995, "grad_norm": 0.097373818896628, "learning_rate": 1.076935563528685e-05, "loss": 0.4873, "num_tokens": 7539022799.0, "step": 1805 }, { "epoch": 3.6130728775356875, "grad_norm": 0.0856147995060332, "learning_rate": 1.075110398557567e-05, "loss": 0.4886, "num_tokens": 7543217103.0, "step": 1806 }, { "epoch": 3.615076383671425, "grad_norm": 0.09345630340327807, "learning_rate": 1.0732871293268022e-05, "loss": 0.4818, "num_tokens": 7547393023.0, "step": 1807 }, { "epoch": 3.6170798898071626, "grad_norm": 0.08449538313684868, "learning_rate": 1.071465758909087e-05, "loss": 0.4792, "num_tokens": 7551575631.0, "step": 1808 }, { "epoch": 3.6190833959429, "grad_norm": 0.08773327307815101, "learning_rate": 1.0696462903739197e-05, "loss": 0.4871, "num_tokens": 7555769935.0, "step": 1809 }, { "epoch": 3.6210869020786376, "grad_norm": 0.09291699856981218, "learning_rate": 1.0678287267875906e-05, "loss": 0.4962, "num_tokens": 7559964239.0, "step": 1810 }, { "epoch": 3.623090408214375, "grad_norm": 0.0926558979583835, "learning_rate": 1.0660130712131829e-05, "loss": 0.493, "num_tokens": 7564137978.0, "step": 1811 }, { "epoch": 3.6250939143501126, "grad_norm": 0.08513760658505806, "learning_rate": 1.0641993267105607e-05, "loss": 0.4723, "num_tokens": 7568332282.0, "step": 1812 }, { "epoch": 3.62709742048585, "grad_norm": 0.10735286358529275, "learning_rate": 1.0623874963363718e-05, "loss": 0.487, "num_tokens": 7572503876.0, "step": 1813 }, { "epoch": 3.6291009266215877, "grad_norm": 0.09917748241610583, "learning_rate": 1.0605775831440335e-05, "loss": 0.5084, "num_tokens": 7576698180.0, "step": 1814 }, { "epoch": 3.631104432757325, "grad_norm": 0.08837876497651995, "learning_rate": 1.0587695901837361e-05, "loss": 0.4904, "num_tokens": 7580892484.0, "step": 1815 }, { "epoch": 3.6331079388930627, "grad_norm": 0.08899794791978886, "learning_rate": 1.0569635205024307e-05, "loss": 0.4852, "num_tokens": 7585086788.0, "step": 1816 }, { "epoch": 3.6351114450288002, "grad_norm": 0.09263528118367706, "learning_rate": 1.0551593771438306e-05, "loss": 0.4815, "num_tokens": 7589281092.0, "step": 1817 }, { "epoch": 3.6371149511645378, "grad_norm": 0.09177107969965823, "learning_rate": 1.053357163148399e-05, "loss": 0.4759, "num_tokens": 7593475396.0, "step": 1818 }, { "epoch": 3.6391184573002757, "grad_norm": 0.08646985564194207, "learning_rate": 1.0515568815533498e-05, "loss": 0.4813, "num_tokens": 7597669700.0, "step": 1819 }, { "epoch": 3.641121963436013, "grad_norm": 0.09323841523805781, "learning_rate": 1.0497585353926398e-05, "loss": 0.4705, "num_tokens": 7601864004.0, "step": 1820 }, { "epoch": 3.6431254695717508, "grad_norm": 0.09030667801480304, "learning_rate": 1.0479621276969638e-05, "loss": 0.4785, "num_tokens": 7606028109.0, "step": 1821 }, { "epoch": 3.645128975707488, "grad_norm": 0.08886779446880759, "learning_rate": 1.0461676614937513e-05, "loss": 0.4776, "num_tokens": 7610222413.0, "step": 1822 }, { "epoch": 3.647132481843226, "grad_norm": 0.08066871826334963, "learning_rate": 1.044375139807157e-05, "loss": 0.488, "num_tokens": 7614416717.0, "step": 1823 }, { "epoch": 3.6491359879789633, "grad_norm": 0.09054345605318044, "learning_rate": 1.0425845656580615e-05, "loss": 0.4937, "num_tokens": 7618582712.0, "step": 1824 }, { "epoch": 3.651139494114701, "grad_norm": 0.08801791389618266, "learning_rate": 1.0407959420640608e-05, "loss": 0.473, "num_tokens": 7622755636.0, "step": 1825 }, { "epoch": 3.6531430002504384, "grad_norm": 0.09562437678494654, "learning_rate": 1.0390092720394651e-05, "loss": 0.4828, "num_tokens": 7626942478.0, "step": 1826 }, { "epoch": 3.655146506386176, "grad_norm": 0.0893727364374112, "learning_rate": 1.0372245585952929e-05, "loss": 0.4865, "num_tokens": 7631136782.0, "step": 1827 }, { "epoch": 3.6571500125219134, "grad_norm": 0.09534248483227961, "learning_rate": 1.035441804739264e-05, "loss": 0.5024, "num_tokens": 7635301822.0, "step": 1828 }, { "epoch": 3.659153518657651, "grad_norm": 0.0942333878156433, "learning_rate": 1.0336610134757956e-05, "loss": 0.4914, "num_tokens": 7639476236.0, "step": 1829 }, { "epoch": 3.6611570247933884, "grad_norm": 0.08883273230782071, "learning_rate": 1.0318821878059997e-05, "loss": 0.4904, "num_tokens": 7643645292.0, "step": 1830 }, { "epoch": 3.663160530929126, "grad_norm": 0.09348474047716002, "learning_rate": 1.0301053307276728e-05, "loss": 0.491, "num_tokens": 7647810340.0, "step": 1831 }, { "epoch": 3.6651640370648635, "grad_norm": 0.09125702740829993, "learning_rate": 1.0283304452352956e-05, "loss": 0.5056, "num_tokens": 7651977600.0, "step": 1832 }, { "epoch": 3.667167543200601, "grad_norm": 0.08416923456695984, "learning_rate": 1.026557534320025e-05, "loss": 0.505, "num_tokens": 7656171904.0, "step": 1833 }, { "epoch": 3.6691710493363385, "grad_norm": 0.09174999718926198, "learning_rate": 1.0247866009696915e-05, "loss": 0.4901, "num_tokens": 7660366208.0, "step": 1834 }, { "epoch": 3.671174555472076, "grad_norm": 0.09156888361566115, "learning_rate": 1.0230176481687922e-05, "loss": 0.477, "num_tokens": 7664560512.0, "step": 1835 }, { "epoch": 3.6731780616078136, "grad_norm": 0.0867176745029227, "learning_rate": 1.021250678898487e-05, "loss": 0.4753, "num_tokens": 7668754816.0, "step": 1836 }, { "epoch": 3.675181567743551, "grad_norm": 0.09130438671804711, "learning_rate": 1.0194856961365916e-05, "loss": 0.4885, "num_tokens": 7672930545.0, "step": 1837 }, { "epoch": 3.6771850738792886, "grad_norm": 0.08469867140864343, "learning_rate": 1.0177227028575758e-05, "loss": 0.4799, "num_tokens": 7677115024.0, "step": 1838 }, { "epoch": 3.679188580015026, "grad_norm": 0.08273532353040078, "learning_rate": 1.0159617020325543e-05, "loss": 0.4788, "num_tokens": 7681284841.0, "step": 1839 }, { "epoch": 3.681192086150764, "grad_norm": 0.08647507179538337, "learning_rate": 1.0142026966292865e-05, "loss": 0.4814, "num_tokens": 7685469816.0, "step": 1840 }, { "epoch": 3.683195592286501, "grad_norm": 0.08912974820080141, "learning_rate": 1.0124456896121667e-05, "loss": 0.5027, "num_tokens": 7689642132.0, "step": 1841 }, { "epoch": 3.685199098422239, "grad_norm": 0.08253333655652785, "learning_rate": 1.0106906839422239e-05, "loss": 0.4905, "num_tokens": 7693836436.0, "step": 1842 }, { "epoch": 3.687202604557976, "grad_norm": 0.08811294860337093, "learning_rate": 1.0089376825771107e-05, "loss": 0.482, "num_tokens": 7698021852.0, "step": 1843 }, { "epoch": 3.689206110693714, "grad_norm": 0.0847944520438243, "learning_rate": 1.0071866884711063e-05, "loss": 0.4857, "num_tokens": 7702216156.0, "step": 1844 }, { "epoch": 3.6912096168294517, "grad_norm": 0.08361350507495152, "learning_rate": 1.0054377045751033e-05, "loss": 0.485, "num_tokens": 7706410460.0, "step": 1845 }, { "epoch": 3.693213122965189, "grad_norm": 0.08597531157421934, "learning_rate": 1.0036907338366096e-05, "loss": 0.4755, "num_tokens": 7710600421.0, "step": 1846 }, { "epoch": 3.6952166291009267, "grad_norm": 0.0788084085965518, "learning_rate": 1.0019457791997373e-05, "loss": 0.4899, "num_tokens": 7714794725.0, "step": 1847 }, { "epoch": 3.6972201352366643, "grad_norm": 0.0871810632141608, "learning_rate": 1.0002028436052043e-05, "loss": 0.4797, "num_tokens": 7718961156.0, "step": 1848 }, { "epoch": 3.699223641372402, "grad_norm": 0.09164990887751884, "learning_rate": 9.98461929990322e-06, "loss": 0.486, "num_tokens": 7723155460.0, "step": 1849 }, { "epoch": 3.7012271475081393, "grad_norm": 0.09036741031330565, "learning_rate": 9.967230412889978e-06, "loss": 0.4966, "num_tokens": 7727349764.0, "step": 1850 }, { "epoch": 3.703230653643877, "grad_norm": 0.09094215309826684, "learning_rate": 9.949861804317234e-06, "loss": 0.4927, "num_tokens": 7731509566.0, "step": 1851 }, { "epoch": 3.7052341597796143, "grad_norm": 0.0920311006275612, "learning_rate": 9.932513503455771e-06, "loss": 0.4904, "num_tokens": 7735703194.0, "step": 1852 }, { "epoch": 3.707237665915352, "grad_norm": 0.08482922648291695, "learning_rate": 9.915185539542103e-06, "loss": 0.4943, "num_tokens": 7739897498.0, "step": 1853 }, { "epoch": 3.7092411720510894, "grad_norm": 0.7809233171794516, "learning_rate": 9.897877941778505e-06, "loss": 0.4788, "num_tokens": 7744059511.0, "step": 1854 }, { "epoch": 3.711244678186827, "grad_norm": 0.10611000582299297, "learning_rate": 9.880590739332904e-06, "loss": 0.4904, "num_tokens": 7748253815.0, "step": 1855 }, { "epoch": 3.7132481843225644, "grad_norm": 0.10111533062789255, "learning_rate": 9.863323961338878e-06, "loss": 0.49, "num_tokens": 7752448119.0, "step": 1856 }, { "epoch": 3.715251690458302, "grad_norm": 0.09479155124689459, "learning_rate": 9.846077636895566e-06, "loss": 0.4805, "num_tokens": 7756616474.0, "step": 1857 }, { "epoch": 3.7172551965940395, "grad_norm": 0.09142412975464015, "learning_rate": 9.828851795067657e-06, "loss": 0.5042, "num_tokens": 7760752031.0, "step": 1858 }, { "epoch": 3.719258702729777, "grad_norm": 0.10919988874140432, "learning_rate": 9.811646464885292e-06, "loss": 0.4904, "num_tokens": 7764946335.0, "step": 1859 }, { "epoch": 3.7212622088655145, "grad_norm": 0.08642817827967057, "learning_rate": 9.79446167534407e-06, "loss": 0.4794, "num_tokens": 7769140639.0, "step": 1860 }, { "epoch": 3.723265715001252, "grad_norm": 0.09512064822219733, "learning_rate": 9.777297455404971e-06, "loss": 0.4887, "num_tokens": 7773325279.0, "step": 1861 }, { "epoch": 3.7252692211369896, "grad_norm": 0.086965834508418, "learning_rate": 9.760153833994306e-06, "loss": 0.4757, "num_tokens": 7777519583.0, "step": 1862 }, { "epoch": 3.7272727272727275, "grad_norm": 0.08222958453802033, "learning_rate": 9.743030840003655e-06, "loss": 0.4808, "num_tokens": 7781683970.0, "step": 1863 }, { "epoch": 3.7292762334084646, "grad_norm": 0.08244616510503312, "learning_rate": 9.725928502289868e-06, "loss": 0.4854, "num_tokens": 7785858340.0, "step": 1864 }, { "epoch": 3.7312797395442026, "grad_norm": 0.09720690211329475, "learning_rate": 9.70884684967495e-06, "loss": 0.4825, "num_tokens": 7790026729.0, "step": 1865 }, { "epoch": 3.7332832456799396, "grad_norm": 0.0893935232033245, "learning_rate": 9.691785910946076e-06, "loss": 0.4739, "num_tokens": 7794221033.0, "step": 1866 }, { "epoch": 3.7352867518156776, "grad_norm": 0.08949288628153199, "learning_rate": 9.674745714855486e-06, "loss": 0.4847, "num_tokens": 7798397432.0, "step": 1867 }, { "epoch": 3.737290257951415, "grad_norm": 0.08879623278151602, "learning_rate": 9.657726290120489e-06, "loss": 0.4944, "num_tokens": 7802591736.0, "step": 1868 }, { "epoch": 3.7392937640871526, "grad_norm": 0.0902121421297967, "learning_rate": 9.640727665423357e-06, "loss": 0.4933, "num_tokens": 7806769453.0, "step": 1869 }, { "epoch": 3.74129727022289, "grad_norm": 0.08568534512515906, "learning_rate": 9.623749869411347e-06, "loss": 0.4842, "num_tokens": 7810963757.0, "step": 1870 }, { "epoch": 3.7433007763586277, "grad_norm": 0.08317721663667663, "learning_rate": 9.606792930696581e-06, "loss": 0.4859, "num_tokens": 7815127159.0, "step": 1871 }, { "epoch": 3.745304282494365, "grad_norm": 0.09076021315446207, "learning_rate": 9.589856877856053e-06, "loss": 0.485, "num_tokens": 7819321463.0, "step": 1872 }, { "epoch": 3.7473077886301027, "grad_norm": 0.08548214269969256, "learning_rate": 9.572941739431538e-06, "loss": 0.4925, "num_tokens": 7823515767.0, "step": 1873 }, { "epoch": 3.7493112947658402, "grad_norm": 0.08501403527411415, "learning_rate": 9.55604754392959e-06, "loss": 0.4785, "num_tokens": 7827674039.0, "step": 1874 }, { "epoch": 3.7513148009015778, "grad_norm": 0.08358664650802249, "learning_rate": 9.539174319821442e-06, "loss": 0.4974, "num_tokens": 7831851941.0, "step": 1875 }, { "epoch": 3.7533183070373153, "grad_norm": 0.09640496850264699, "learning_rate": 9.522322095543005e-06, "loss": 0.4779, "num_tokens": 7836046245.0, "step": 1876 }, { "epoch": 3.755321813173053, "grad_norm": 0.0955065540947874, "learning_rate": 9.505490899494783e-06, "loss": 0.4929, "num_tokens": 7840240549.0, "step": 1877 }, { "epoch": 3.7573253193087903, "grad_norm": 0.08936904243141344, "learning_rate": 9.488680760041859e-06, "loss": 0.478, "num_tokens": 7844434853.0, "step": 1878 }, { "epoch": 3.759328825444528, "grad_norm": 0.0835909044379709, "learning_rate": 9.471891705513817e-06, "loss": 0.4854, "num_tokens": 7848618378.0, "step": 1879 }, { "epoch": 3.7613323315802654, "grad_norm": 0.08682382260402165, "learning_rate": 9.45512376420472e-06, "loss": 0.4803, "num_tokens": 7852812682.0, "step": 1880 }, { "epoch": 3.763335837716003, "grad_norm": 0.09246571733016484, "learning_rate": 9.438376964373032e-06, "loss": 0.4803, "num_tokens": 7857006986.0, "step": 1881 }, { "epoch": 3.7653393438517404, "grad_norm": 0.08716789821474277, "learning_rate": 9.421651334241607e-06, "loss": 0.4919, "num_tokens": 7861184809.0, "step": 1882 }, { "epoch": 3.767342849987478, "grad_norm": 0.08624943721838421, "learning_rate": 9.4049469019976e-06, "loss": 0.5009, "num_tokens": 7865379113.0, "step": 1883 }, { "epoch": 3.769346356123216, "grad_norm": 0.09312554416690697, "learning_rate": 9.388263695792467e-06, "loss": 0.4999, "num_tokens": 7869573417.0, "step": 1884 }, { "epoch": 3.771349862258953, "grad_norm": 0.0863203702515645, "learning_rate": 9.371601743741868e-06, "loss": 0.4892, "num_tokens": 7873767721.0, "step": 1885 }, { "epoch": 3.773353368394691, "grad_norm": 0.07944246608186843, "learning_rate": 9.354961073925665e-06, "loss": 0.4971, "num_tokens": 7877941705.0, "step": 1886 }, { "epoch": 3.775356874530428, "grad_norm": 0.08637839408201471, "learning_rate": 9.338341714387842e-06, "loss": 0.4876, "num_tokens": 7882120181.0, "step": 1887 }, { "epoch": 3.777360380666166, "grad_norm": 0.08864791320863008, "learning_rate": 9.321743693136482e-06, "loss": 0.4826, "num_tokens": 7886314485.0, "step": 1888 }, { "epoch": 3.7793638868019035, "grad_norm": 0.08833034650076241, "learning_rate": 9.30516703814368e-06, "loss": 0.4815, "num_tokens": 7890486935.0, "step": 1889 }, { "epoch": 3.781367392937641, "grad_norm": 0.0896859040229601, "learning_rate": 9.288611777345554e-06, "loss": 0.4865, "num_tokens": 7894681239.0, "step": 1890 }, { "epoch": 3.7833708990733785, "grad_norm": 0.08727926668930727, "learning_rate": 9.272077938642147e-06, "loss": 0.4903, "num_tokens": 7898875543.0, "step": 1891 }, { "epoch": 3.785374405209116, "grad_norm": 0.08363895361290248, "learning_rate": 9.255565549897409e-06, "loss": 0.4818, "num_tokens": 7903048447.0, "step": 1892 }, { "epoch": 3.7873779113448536, "grad_norm": 0.08388291308064223, "learning_rate": 9.239074638939144e-06, "loss": 0.4958, "num_tokens": 7907242751.0, "step": 1893 }, { "epoch": 3.789381417480591, "grad_norm": 0.09303505069571412, "learning_rate": 9.222605233558946e-06, "loss": 0.4907, "num_tokens": 7911414035.0, "step": 1894 }, { "epoch": 3.7913849236163286, "grad_norm": 0.07873362539952093, "learning_rate": 9.206157361512182e-06, "loss": 0.4749, "num_tokens": 7915608339.0, "step": 1895 }, { "epoch": 3.793388429752066, "grad_norm": 0.08965049383430052, "learning_rate": 9.18973105051792e-06, "loss": 0.4913, "num_tokens": 7919798171.0, "step": 1896 }, { "epoch": 3.7953919358878037, "grad_norm": 0.08310126891083344, "learning_rate": 9.173326328258893e-06, "loss": 0.484, "num_tokens": 7923991670.0, "step": 1897 }, { "epoch": 3.797395442023541, "grad_norm": 0.08447605736484394, "learning_rate": 9.156943222381456e-06, "loss": 0.4841, "num_tokens": 7928173141.0, "step": 1898 }, { "epoch": 3.7993989481592787, "grad_norm": 0.08294291439525785, "learning_rate": 9.140581760495538e-06, "loss": 0.4707, "num_tokens": 7932356615.0, "step": 1899 }, { "epoch": 3.801402454295016, "grad_norm": 0.08986485619804743, "learning_rate": 9.124241970174575e-06, "loss": 0.4779, "num_tokens": 7936521019.0, "step": 1900 }, { "epoch": 3.8034059604307537, "grad_norm": 0.08865096906511742, "learning_rate": 9.1079238789555e-06, "loss": 0.4974, "num_tokens": 7940674164.0, "step": 1901 }, { "epoch": 3.8054094665664913, "grad_norm": 0.08549856766512204, "learning_rate": 9.091627514338658e-06, "loss": 0.4909, "num_tokens": 7944868468.0, "step": 1902 }, { "epoch": 3.807412972702229, "grad_norm": 0.09333895365020572, "learning_rate": 9.075352903787802e-06, "loss": 0.489, "num_tokens": 7949047222.0, "step": 1903 }, { "epoch": 3.8094164788379663, "grad_norm": 0.0924074058239638, "learning_rate": 9.05910007473e-06, "loss": 0.4884, "num_tokens": 7953241526.0, "step": 1904 }, { "epoch": 3.811419984973704, "grad_norm": 0.07991018542739434, "learning_rate": 9.042869054555625e-06, "loss": 0.5029, "num_tokens": 7957435830.0, "step": 1905 }, { "epoch": 3.8134234911094413, "grad_norm": 0.09468404281878016, "learning_rate": 9.0266598706183e-06, "loss": 0.4869, "num_tokens": 7961606504.0, "step": 1906 }, { "epoch": 3.8154269972451793, "grad_norm": 0.09044560686002914, "learning_rate": 9.010472550234848e-06, "loss": 0.4972, "num_tokens": 7965800808.0, "step": 1907 }, { "epoch": 3.8174305033809164, "grad_norm": 0.0846287156512065, "learning_rate": 8.994307120685224e-06, "loss": 0.4915, "num_tokens": 7969995112.0, "step": 1908 }, { "epoch": 3.8194340095166543, "grad_norm": 0.08545723766133585, "learning_rate": 8.97816360921252e-06, "loss": 0.4865, "num_tokens": 7974189416.0, "step": 1909 }, { "epoch": 3.8214375156523914, "grad_norm": 0.08706670069500869, "learning_rate": 8.962042043022873e-06, "loss": 0.4848, "num_tokens": 7978352828.0, "step": 1910 }, { "epoch": 3.8234410217881294, "grad_norm": 0.08723206369438637, "learning_rate": 8.945942449285447e-06, "loss": 0.4767, "num_tokens": 7982547132.0, "step": 1911 }, { "epoch": 3.825444527923867, "grad_norm": 0.09032380780422834, "learning_rate": 8.92986485513236e-06, "loss": 0.4817, "num_tokens": 7986728882.0, "step": 1912 }, { "epoch": 3.8274480340596044, "grad_norm": 0.08235610572466291, "learning_rate": 8.91380928765868e-06, "loss": 0.4923, "num_tokens": 7990907242.0, "step": 1913 }, { "epoch": 3.829451540195342, "grad_norm": 0.08180697030017117, "learning_rate": 8.897775773922316e-06, "loss": 0.4799, "num_tokens": 7995091674.0, "step": 1914 }, { "epoch": 3.8314550463310795, "grad_norm": 0.09347698073434159, "learning_rate": 8.881764340944063e-06, "loss": 0.4847, "num_tokens": 7999245325.0, "step": 1915 }, { "epoch": 3.833458552466817, "grad_norm": 0.07978165980602425, "learning_rate": 8.865775015707461e-06, "loss": 0.4933, "num_tokens": 8003415108.0, "step": 1916 }, { "epoch": 3.8354620586025545, "grad_norm": 0.08357009696169472, "learning_rate": 8.84980782515881e-06, "loss": 0.4958, "num_tokens": 8007586072.0, "step": 1917 }, { "epoch": 3.837465564738292, "grad_norm": 0.09819544858449104, "learning_rate": 8.833862796207096e-06, "loss": 0.478, "num_tokens": 8011780376.0, "step": 1918 }, { "epoch": 3.8394690708740296, "grad_norm": 0.08308410618349651, "learning_rate": 8.817939955723977e-06, "loss": 0.4802, "num_tokens": 8015972986.0, "step": 1919 }, { "epoch": 3.841472577009767, "grad_norm": 0.08918538922832508, "learning_rate": 8.802039330543693e-06, "loss": 0.4736, "num_tokens": 8020167290.0, "step": 1920 }, { "epoch": 3.8434760831455046, "grad_norm": 0.08924616404165715, "learning_rate": 8.78616094746307e-06, "loss": 0.4743, "num_tokens": 8024314458.0, "step": 1921 }, { "epoch": 3.845479589281242, "grad_norm": 0.08487576961541658, "learning_rate": 8.770304833241424e-06, "loss": 0.4914, "num_tokens": 8028508762.0, "step": 1922 }, { "epoch": 3.8474830954169796, "grad_norm": 0.08051043467001406, "learning_rate": 8.75447101460056e-06, "loss": 0.4866, "num_tokens": 8032693785.0, "step": 1923 }, { "epoch": 3.849486601552717, "grad_norm": 0.08925026993600654, "learning_rate": 8.73865951822471e-06, "loss": 0.4846, "num_tokens": 8036888089.0, "step": 1924 }, { "epoch": 3.8514901076884547, "grad_norm": 0.08569372419558699, "learning_rate": 8.722870370760484e-06, "loss": 0.4826, "num_tokens": 8041082393.0, "step": 1925 }, { "epoch": 3.853493613824192, "grad_norm": 0.08142795518465076, "learning_rate": 8.707103598816813e-06, "loss": 0.4854, "num_tokens": 8045273663.0, "step": 1926 }, { "epoch": 3.8554971199599297, "grad_norm": 0.094637699366646, "learning_rate": 8.691359228964946e-06, "loss": 0.4844, "num_tokens": 8049459428.0, "step": 1927 }, { "epoch": 3.8575006260956677, "grad_norm": 0.08037032150618319, "learning_rate": 8.675637287738353e-06, "loss": 0.4787, "num_tokens": 8053653732.0, "step": 1928 }, { "epoch": 3.8595041322314048, "grad_norm": 0.08104611614586435, "learning_rate": 8.659937801632723e-06, "loss": 0.4814, "num_tokens": 8057823690.0, "step": 1929 }, { "epoch": 3.8615076383671427, "grad_norm": 0.0826588947275504, "learning_rate": 8.644260797105893e-06, "loss": 0.4789, "num_tokens": 8062017994.0, "step": 1930 }, { "epoch": 3.86351114450288, "grad_norm": 0.07831287175919883, "learning_rate": 8.628606300577819e-06, "loss": 0.4838, "num_tokens": 8066200746.0, "step": 1931 }, { "epoch": 3.8655146506386178, "grad_norm": 0.0807237553548028, "learning_rate": 8.61297433843051e-06, "loss": 0.4876, "num_tokens": 8070377339.0, "step": 1932 }, { "epoch": 3.8675181567743553, "grad_norm": 0.08627559988538329, "learning_rate": 8.597364937008033e-06, "loss": 0.4842, "num_tokens": 8074556580.0, "step": 1933 }, { "epoch": 3.869521662910093, "grad_norm": 0.0839822555522512, "learning_rate": 8.581778122616392e-06, "loss": 0.4965, "num_tokens": 8078750884.0, "step": 1934 }, { "epoch": 3.8715251690458303, "grad_norm": 0.09462127442315006, "learning_rate": 8.566213921523556e-06, "loss": 0.4837, "num_tokens": 8082945188.0, "step": 1935 }, { "epoch": 3.873528675181568, "grad_norm": 0.08536405826899614, "learning_rate": 8.550672359959366e-06, "loss": 0.4917, "num_tokens": 8087139492.0, "step": 1936 }, { "epoch": 3.8755321813173054, "grad_norm": 0.09064275306513804, "learning_rate": 8.535153464115526e-06, "loss": 0.4894, "num_tokens": 8091302762.0, "step": 1937 }, { "epoch": 3.877535687453043, "grad_norm": 0.0796033518000145, "learning_rate": 8.519657260145522e-06, "loss": 0.4941, "num_tokens": 8095491947.0, "step": 1938 }, { "epoch": 3.8795391935887804, "grad_norm": 0.09535628252506191, "learning_rate": 8.504183774164621e-06, "loss": 0.4882, "num_tokens": 8099686251.0, "step": 1939 }, { "epoch": 3.881542699724518, "grad_norm": 0.10563180151417488, "learning_rate": 8.488733032249774e-06, "loss": 0.4876, "num_tokens": 8103853767.0, "step": 1940 }, { "epoch": 3.8835462058602555, "grad_norm": 0.08103329195287359, "learning_rate": 8.473305060439646e-06, "loss": 0.4894, "num_tokens": 8108022284.0, "step": 1941 }, { "epoch": 3.885549711995993, "grad_norm": 0.10055084904598675, "learning_rate": 8.457899884734483e-06, "loss": 0.4872, "num_tokens": 8112216588.0, "step": 1942 }, { "epoch": 3.8875532181317305, "grad_norm": 0.09395881455279576, "learning_rate": 8.442517531096149e-06, "loss": 0.4767, "num_tokens": 8116410892.0, "step": 1943 }, { "epoch": 3.889556724267468, "grad_norm": 0.08081273541425969, "learning_rate": 8.427158025448014e-06, "loss": 0.4775, "num_tokens": 8120583185.0, "step": 1944 }, { "epoch": 3.8915602304032055, "grad_norm": 0.09326247360302085, "learning_rate": 8.411821393674975e-06, "loss": 0.4885, "num_tokens": 8124760262.0, "step": 1945 }, { "epoch": 3.893563736538943, "grad_norm": 0.08741870766527744, "learning_rate": 8.396507661623355e-06, "loss": 0.4926, "num_tokens": 8128954566.0, "step": 1946 }, { "epoch": 3.8955672426746806, "grad_norm": 0.09043094809004079, "learning_rate": 8.381216855100901e-06, "loss": 0.493, "num_tokens": 8133148870.0, "step": 1947 }, { "epoch": 3.897570748810418, "grad_norm": 0.0958617907954117, "learning_rate": 8.365948999876714e-06, "loss": 0.4921, "num_tokens": 8137343174.0, "step": 1948 }, { "epoch": 3.8995742549461556, "grad_norm": 0.09790772111632152, "learning_rate": 8.350704121681222e-06, "loss": 0.4896, "num_tokens": 8141537478.0, "step": 1949 }, { "epoch": 3.901577761081893, "grad_norm": 0.0778729802473557, "learning_rate": 8.335482246206134e-06, "loss": 0.4805, "num_tokens": 8145711251.0, "step": 1950 }, { "epoch": 3.903581267217631, "grad_norm": 0.08849609117145635, "learning_rate": 8.320283399104387e-06, "loss": 0.4872, "num_tokens": 8149887634.0, "step": 1951 }, { "epoch": 3.905584773353368, "grad_norm": 0.10772093434950156, "learning_rate": 8.305107605990103e-06, "loss": 0.4865, "num_tokens": 8154081938.0, "step": 1952 }, { "epoch": 3.907588279489106, "grad_norm": 0.08254566951404374, "learning_rate": 8.28995489243857e-06, "loss": 0.4777, "num_tokens": 8158254619.0, "step": 1953 }, { "epoch": 3.909591785624843, "grad_norm": 0.08720216140427856, "learning_rate": 8.274825283986158e-06, "loss": 0.4957, "num_tokens": 8162428661.0, "step": 1954 }, { "epoch": 3.911595291760581, "grad_norm": 0.09537281680140597, "learning_rate": 8.259718806130322e-06, "loss": 0.4891, "num_tokens": 8166622965.0, "step": 1955 }, { "epoch": 3.9135987978963187, "grad_norm": 0.13064833257952205, "learning_rate": 8.244635484329519e-06, "loss": 0.4846, "num_tokens": 8170817269.0, "step": 1956 }, { "epoch": 3.9156023040320562, "grad_norm": 0.08029616794336981, "learning_rate": 8.229575344003185e-06, "loss": 0.4953, "num_tokens": 8175011573.0, "step": 1957 }, { "epoch": 3.9176058101677937, "grad_norm": 0.10244749837500322, "learning_rate": 8.214538410531694e-06, "loss": 0.4854, "num_tokens": 8179188212.0, "step": 1958 }, { "epoch": 3.9196093163035313, "grad_norm": 0.11767011119856459, "learning_rate": 8.199524709256312e-06, "loss": 0.4976, "num_tokens": 8183378590.0, "step": 1959 }, { "epoch": 3.921612822439269, "grad_norm": 0.09424809304374154, "learning_rate": 8.184534265479139e-06, "loss": 0.486, "num_tokens": 8187551424.0, "step": 1960 }, { "epoch": 3.9236163285750063, "grad_norm": 0.09695945775149666, "learning_rate": 8.169567104463098e-06, "loss": 0.4877, "num_tokens": 8191693445.0, "step": 1961 }, { "epoch": 3.925619834710744, "grad_norm": 0.13055195858598787, "learning_rate": 8.154623251431857e-06, "loss": 0.4985, "num_tokens": 8195864779.0, "step": 1962 }, { "epoch": 3.9276233408464813, "grad_norm": 0.09391505990155703, "learning_rate": 8.139702731569815e-06, "loss": 0.4987, "num_tokens": 8200059083.0, "step": 1963 }, { "epoch": 3.929626846982219, "grad_norm": 0.0912111974793315, "learning_rate": 8.124805570022051e-06, "loss": 0.4678, "num_tokens": 8204244261.0, "step": 1964 }, { "epoch": 3.9316303531179564, "grad_norm": 0.10790050357971083, "learning_rate": 8.109931791894265e-06, "loss": 0.4838, "num_tokens": 8208438565.0, "step": 1965 }, { "epoch": 3.933633859253694, "grad_norm": 0.10437970683297804, "learning_rate": 8.095081422252764e-06, "loss": 0.4838, "num_tokens": 8212632869.0, "step": 1966 }, { "epoch": 3.9356373653894314, "grad_norm": 0.08659920822221678, "learning_rate": 8.080254486124393e-06, "loss": 0.4846, "num_tokens": 8216827173.0, "step": 1967 }, { "epoch": 3.937640871525169, "grad_norm": 0.10187378596836923, "learning_rate": 8.065451008496517e-06, "loss": 0.4814, "num_tokens": 8221021477.0, "step": 1968 }, { "epoch": 3.9396443776609065, "grad_norm": 0.09744494775961673, "learning_rate": 8.05067101431696e-06, "loss": 0.4936, "num_tokens": 8225205512.0, "step": 1969 }, { "epoch": 3.941647883796644, "grad_norm": 0.08321861514627513, "learning_rate": 8.035914528493978e-06, "loss": 0.4953, "num_tokens": 8229399816.0, "step": 1970 }, { "epoch": 3.9436513899323815, "grad_norm": 0.0941980610493231, "learning_rate": 8.021181575896191e-06, "loss": 0.4872, "num_tokens": 8233594120.0, "step": 1971 }, { "epoch": 3.9456548960681195, "grad_norm": 0.12257105171928834, "learning_rate": 8.006472181352585e-06, "loss": 0.5004, "num_tokens": 8237780163.0, "step": 1972 }, { "epoch": 3.9476584022038566, "grad_norm": 0.0878160085436083, "learning_rate": 7.991786369652419e-06, "loss": 0.4806, "num_tokens": 8241974467.0, "step": 1973 }, { "epoch": 3.9496619083395945, "grad_norm": 0.09059880273691032, "learning_rate": 7.977124165545226e-06, "loss": 0.4901, "num_tokens": 8246168771.0, "step": 1974 }, { "epoch": 3.9516654144753316, "grad_norm": 0.10200598193501738, "learning_rate": 7.962485593740747e-06, "loss": 0.4815, "num_tokens": 8250363075.0, "step": 1975 }, { "epoch": 3.9536689206110696, "grad_norm": 0.09152591576385186, "learning_rate": 7.947870678908896e-06, "loss": 0.4873, "num_tokens": 8254557379.0, "step": 1976 }, { "epoch": 3.9556724267468066, "grad_norm": 0.0828483624698856, "learning_rate": 7.933279445679723e-06, "loss": 0.4725, "num_tokens": 8258751683.0, "step": 1977 }, { "epoch": 3.9576759328825446, "grad_norm": 0.09334441139241996, "learning_rate": 7.918711918643365e-06, "loss": 0.491, "num_tokens": 8262945987.0, "step": 1978 }, { "epoch": 3.959679439018282, "grad_norm": 0.0976008600421356, "learning_rate": 7.904168122350001e-06, "loss": 0.489, "num_tokens": 8267140291.0, "step": 1979 }, { "epoch": 3.9616829451540196, "grad_norm": 0.08807867534692704, "learning_rate": 7.889648081309838e-06, "loss": 0.4851, "num_tokens": 8271332274.0, "step": 1980 }, { "epoch": 3.963686451289757, "grad_norm": 0.09027208490901717, "learning_rate": 7.875151819993023e-06, "loss": 0.4867, "num_tokens": 8275501053.0, "step": 1981 }, { "epoch": 3.9656899574254947, "grad_norm": 0.096933304587064, "learning_rate": 7.860679362829646e-06, "loss": 0.4847, "num_tokens": 8279695357.0, "step": 1982 }, { "epoch": 3.967693463561232, "grad_norm": 0.08197270570764244, "learning_rate": 7.846230734209671e-06, "loss": 0.4661, "num_tokens": 8283889661.0, "step": 1983 }, { "epoch": 3.9696969696969697, "grad_norm": 0.09013254268166136, "learning_rate": 7.831805958482914e-06, "loss": 0.4787, "num_tokens": 8288077270.0, "step": 1984 }, { "epoch": 3.9717004758327072, "grad_norm": 0.09143599073218248, "learning_rate": 7.817405059958973e-06, "loss": 0.4933, "num_tokens": 8292270145.0, "step": 1985 }, { "epoch": 3.9737039819684448, "grad_norm": 0.08789686592758524, "learning_rate": 7.80302806290724e-06, "loss": 0.4893, "num_tokens": 8296439836.0, "step": 1986 }, { "epoch": 3.9757074881041823, "grad_norm": 0.09148789111812103, "learning_rate": 7.788674991556791e-06, "loss": 0.4787, "num_tokens": 8300606729.0, "step": 1987 }, { "epoch": 3.97771099423992, "grad_norm": 0.09145764404076508, "learning_rate": 7.774345870096408e-06, "loss": 0.4743, "num_tokens": 8304778997.0, "step": 1988 }, { "epoch": 3.9797145003756573, "grad_norm": 0.08293295826616845, "learning_rate": 7.760040722674488e-06, "loss": 0.4842, "num_tokens": 8308970439.0, "step": 1989 }, { "epoch": 3.981718006511395, "grad_norm": 0.09640991348747431, "learning_rate": 7.745759573399053e-06, "loss": 0.4903, "num_tokens": 8313164743.0, "step": 1990 }, { "epoch": 3.9837215126471324, "grad_norm": 0.08552303025680096, "learning_rate": 7.731502446337646e-06, "loss": 0.483, "num_tokens": 8317359047.0, "step": 1991 }, { "epoch": 3.98572501878287, "grad_norm": 0.0841581067516527, "learning_rate": 7.717269365517363e-06, "loss": 0.4903, "num_tokens": 8321489193.0, "step": 1992 }, { "epoch": 3.9877285249186074, "grad_norm": 0.08646840959265159, "learning_rate": 7.703060354924745e-06, "loss": 0.4963, "num_tokens": 8325661939.0, "step": 1993 }, { "epoch": 3.989732031054345, "grad_norm": 0.08642571980678163, "learning_rate": 7.688875438505789e-06, "loss": 0.4877, "num_tokens": 8329825952.0, "step": 1994 }, { "epoch": 3.991735537190083, "grad_norm": 0.09039061013320243, "learning_rate": 7.674714640165876e-06, "loss": 0.487, "num_tokens": 8334020256.0, "step": 1995 }, { "epoch": 3.99373904332582, "grad_norm": 0.08899275256095682, "learning_rate": 7.660577983769753e-06, "loss": 0.477, "num_tokens": 8338194089.0, "step": 1996 }, { "epoch": 3.995742549461558, "grad_norm": 0.08263152183985834, "learning_rate": 7.646465493141462e-06, "loss": 0.4912, "num_tokens": 8342388393.0, "step": 1997 }, { "epoch": 3.997746055597295, "grad_norm": 0.08861090930165598, "learning_rate": 7.632377192064343e-06, "loss": 0.486, "num_tokens": 8346576476.0, "step": 1998 }, { "epoch": 3.999749561733033, "grad_norm": 0.08231676308578938, "learning_rate": 7.618313104280947e-06, "loss": 0.4907, "num_tokens": 8350770780.0, "step": 1999 }, { "epoch": 4.0, "grad_norm": 0.08231676308578938, "learning_rate": 7.604273253493037e-06, "loss": 0.4999, "num_tokens": 8351295068.0, "step": 2000 }, { "epoch": 4.002003506135738, "grad_norm": 0.22741979277896124, "learning_rate": 7.590257663361516e-06, "loss": 0.4709, "num_tokens": 8355489372.0, "step": 2001 }, { "epoch": 4.004007012271475, "grad_norm": 0.09594920695785725, "learning_rate": 7.576266357506417e-06, "loss": 0.4747, "num_tokens": 8359664947.0, "step": 2002 }, { "epoch": 4.006010518407213, "grad_norm": 0.09220119710894949, "learning_rate": 7.562299359506824e-06, "loss": 0.4681, "num_tokens": 8363855904.0, "step": 2003 }, { "epoch": 4.00801402454295, "grad_norm": 0.10179116775825146, "learning_rate": 7.548356692900891e-06, "loss": 0.4691, "num_tokens": 8368043379.0, "step": 2004 }, { "epoch": 4.010017530678688, "grad_norm": 0.09253062417559947, "learning_rate": 7.534438381185734e-06, "loss": 0.466, "num_tokens": 8372237683.0, "step": 2005 }, { "epoch": 4.012021036814425, "grad_norm": 0.08935492068369684, "learning_rate": 7.520544447817441e-06, "loss": 0.4633, "num_tokens": 8376419968.0, "step": 2006 }, { "epoch": 4.014024542950163, "grad_norm": 0.0978307941738397, "learning_rate": 7.506674916211008e-06, "loss": 0.4609, "num_tokens": 8380614272.0, "step": 2007 }, { "epoch": 4.0160280490859, "grad_norm": 0.09883228145276221, "learning_rate": 7.492829809740319e-06, "loss": 0.4682, "num_tokens": 8384808576.0, "step": 2008 }, { "epoch": 4.018031555221638, "grad_norm": 0.08883872827682023, "learning_rate": 7.479009151738078e-06, "loss": 0.4844, "num_tokens": 8389002880.0, "step": 2009 }, { "epoch": 4.020035061357375, "grad_norm": 0.08945372665534287, "learning_rate": 7.465212965495807e-06, "loss": 0.4849, "num_tokens": 8393171235.0, "step": 2010 }, { "epoch": 4.022038567493113, "grad_norm": 0.10023886596333127, "learning_rate": 7.451441274263767e-06, "loss": 0.4647, "num_tokens": 8397365539.0, "step": 2011 }, { "epoch": 4.02404207362885, "grad_norm": 0.09454273600521144, "learning_rate": 7.437694101250949e-06, "loss": 0.4691, "num_tokens": 8401522425.0, "step": 2012 }, { "epoch": 4.026045579764588, "grad_norm": 0.09855166142780276, "learning_rate": 7.423971469625024e-06, "loss": 0.4607, "num_tokens": 8405707374.0, "step": 2013 }, { "epoch": 4.028049085900325, "grad_norm": 0.0886923957455581, "learning_rate": 7.41027340251231e-06, "loss": 0.4559, "num_tokens": 8409901678.0, "step": 2014 }, { "epoch": 4.030052592036063, "grad_norm": 0.09170788802665379, "learning_rate": 7.396599922997708e-06, "loss": 0.487, "num_tokens": 8414095674.0, "step": 2015 }, { "epoch": 4.0320560981718, "grad_norm": 0.08814383604325192, "learning_rate": 7.382951054124705e-06, "loss": 0.4637, "num_tokens": 8418289978.0, "step": 2016 }, { "epoch": 4.034059604307538, "grad_norm": 0.09865006133100851, "learning_rate": 7.3693268188952905e-06, "loss": 0.4687, "num_tokens": 8422457801.0, "step": 2017 }, { "epoch": 4.036063110443275, "grad_norm": 0.0901809939595168, "learning_rate": 7.355727240269959e-06, "loss": 0.4804, "num_tokens": 8426621370.0, "step": 2018 }, { "epoch": 4.038066616579013, "grad_norm": 0.09135545296853169, "learning_rate": 7.342152341167645e-06, "loss": 0.4801, "num_tokens": 8430815674.0, "step": 2019 }, { "epoch": 4.04007012271475, "grad_norm": 0.08713558974800988, "learning_rate": 7.328602144465684e-06, "loss": 0.4732, "num_tokens": 8434987942.0, "step": 2020 }, { "epoch": 4.042073628850488, "grad_norm": 0.08874310832831414, "learning_rate": 7.315076672999788e-06, "loss": 0.4721, "num_tokens": 8439182246.0, "step": 2021 }, { "epoch": 4.044077134986226, "grad_norm": 0.10895464404636764, "learning_rate": 7.30157594956401e-06, "loss": 0.4692, "num_tokens": 8443372207.0, "step": 2022 }, { "epoch": 4.046080641121963, "grad_norm": 0.0890986766127645, "learning_rate": 7.288099996910676e-06, "loss": 0.4797, "num_tokens": 8447566511.0, "step": 2023 }, { "epoch": 4.048084147257701, "grad_norm": 0.08706172882317609, "learning_rate": 7.274648837750382e-06, "loss": 0.4764, "num_tokens": 8451740136.0, "step": 2024 }, { "epoch": 4.0500876533934385, "grad_norm": 0.09408593162488674, "learning_rate": 7.2612224947519295e-06, "loss": 0.4787, "num_tokens": 8455906471.0, "step": 2025 }, { "epoch": 4.052091159529176, "grad_norm": 0.09364633676834641, "learning_rate": 7.247820990542309e-06, "loss": 0.4591, "num_tokens": 8460097741.0, "step": 2026 }, { "epoch": 4.0540946656649135, "grad_norm": 0.08180385328596348, "learning_rate": 7.234444347706646e-06, "loss": 0.4636, "num_tokens": 8464277766.0, "step": 2027 }, { "epoch": 4.0560981718006515, "grad_norm": 0.08422329275666093, "learning_rate": 7.221092588788159e-06, "loss": 0.4724, "num_tokens": 8468454923.0, "step": 2028 }, { "epoch": 4.0581016779363885, "grad_norm": 0.08584170624979878, "learning_rate": 7.207765736288147e-06, "loss": 0.4702, "num_tokens": 8472649227.0, "step": 2029 }, { "epoch": 4.0601051840721265, "grad_norm": 0.0866407901648279, "learning_rate": 7.194463812665928e-06, "loss": 0.4684, "num_tokens": 8476843531.0, "step": 2030 }, { "epoch": 4.062108690207864, "grad_norm": 0.08355506092469632, "learning_rate": 7.181186840338799e-06, "loss": 0.4702, "num_tokens": 8481032211.0, "step": 2031 }, { "epoch": 4.0641121963436015, "grad_norm": 0.08646236886343653, "learning_rate": 7.1679348416820225e-06, "loss": 0.4758, "num_tokens": 8485226515.0, "step": 2032 }, { "epoch": 4.066115702479339, "grad_norm": 0.08352498475832776, "learning_rate": 7.154707839028767e-06, "loss": 0.4744, "num_tokens": 8489411693.0, "step": 2033 }, { "epoch": 4.068119208615077, "grad_norm": 0.08923454648211505, "learning_rate": 7.141505854670069e-06, "loss": 0.4624, "num_tokens": 8493577483.0, "step": 2034 }, { "epoch": 4.070122714750814, "grad_norm": 0.0809240593700823, "learning_rate": 7.128328910854818e-06, "loss": 0.4551, "num_tokens": 8497749077.0, "step": 2035 }, { "epoch": 4.072126220886552, "grad_norm": 0.08668660472310685, "learning_rate": 7.115177029789686e-06, "loss": 0.4784, "num_tokens": 8501926296.0, "step": 2036 }, { "epoch": 4.074129727022289, "grad_norm": 0.09263192173034507, "learning_rate": 7.102050233639124e-06, "loss": 0.4823, "num_tokens": 8506120600.0, "step": 2037 }, { "epoch": 4.076133233158027, "grad_norm": 0.08904423988322492, "learning_rate": 7.088948544525292e-06, "loss": 0.4779, "num_tokens": 8510263687.0, "step": 2038 }, { "epoch": 4.078136739293764, "grad_norm": 0.08963680249254223, "learning_rate": 7.075871984528049e-06, "loss": 0.4736, "num_tokens": 8514448068.0, "step": 2039 }, { "epoch": 4.080140245429502, "grad_norm": 0.08231048570319882, "learning_rate": 7.062820575684906e-06, "loss": 0.4719, "num_tokens": 8518642372.0, "step": 2040 }, { "epoch": 4.082143751565239, "grad_norm": 0.11034434951529258, "learning_rate": 7.049794339990985e-06, "loss": 0.4623, "num_tokens": 8522828346.0, "step": 2041 }, { "epoch": 4.084147257700977, "grad_norm": 0.0833350793797068, "learning_rate": 7.036793299398974e-06, "loss": 0.4591, "num_tokens": 8527022650.0, "step": 2042 }, { "epoch": 4.086150763836714, "grad_norm": 0.08663575626757512, "learning_rate": 7.02381747581912e-06, "loss": 0.4812, "num_tokens": 8531216954.0, "step": 2043 }, { "epoch": 4.088154269972452, "grad_norm": 0.09608868123521538, "learning_rate": 7.0108668911191525e-06, "loss": 0.4678, "num_tokens": 8535383779.0, "step": 2044 }, { "epoch": 4.09015777610819, "grad_norm": 0.08569599178071419, "learning_rate": 6.997941567124287e-06, "loss": 0.4532, "num_tokens": 8539578083.0, "step": 2045 }, { "epoch": 4.092161282243927, "grad_norm": 0.08357565843777122, "learning_rate": 6.985041525617145e-06, "loss": 0.4683, "num_tokens": 8543772387.0, "step": 2046 }, { "epoch": 4.094164788379665, "grad_norm": 0.09081287730261381, "learning_rate": 6.97216678833776e-06, "loss": 0.4672, "num_tokens": 8547962008.0, "step": 2047 }, { "epoch": 4.096168294515402, "grad_norm": 0.09762122120617121, "learning_rate": 6.959317376983512e-06, "loss": 0.4644, "num_tokens": 8552140935.0, "step": 2048 }, { "epoch": 4.09817180065114, "grad_norm": 0.09127221952647085, "learning_rate": 6.946493313209106e-06, "loss": 0.4741, "num_tokens": 8556335239.0, "step": 2049 }, { "epoch": 4.100175306786877, "grad_norm": 0.10151795534435937, "learning_rate": 6.933694618626516e-06, "loss": 0.4793, "num_tokens": 8560515811.0, "step": 2050 }, { "epoch": 4.102178812922615, "grad_norm": 0.08522765132964394, "learning_rate": 6.920921314804982e-06, "loss": 0.4808, "num_tokens": 8564710115.0, "step": 2051 }, { "epoch": 4.104182319058352, "grad_norm": 0.07992721171227934, "learning_rate": 6.908173423270934e-06, "loss": 0.4793, "num_tokens": 8568904419.0, "step": 2052 }, { "epoch": 4.10618582519409, "grad_norm": 0.09338572586913059, "learning_rate": 6.895450965507993e-06, "loss": 0.4666, "num_tokens": 8573072628.0, "step": 2053 }, { "epoch": 4.108189331329827, "grad_norm": 0.08194030901304027, "learning_rate": 6.882753962956901e-06, "loss": 0.4595, "num_tokens": 8577266932.0, "step": 2054 }, { "epoch": 4.110192837465565, "grad_norm": 0.07751883345253137, "learning_rate": 6.870082437015514e-06, "loss": 0.473, "num_tokens": 8581461236.0, "step": 2055 }, { "epoch": 4.112196343601302, "grad_norm": 0.08329325662281038, "learning_rate": 6.857436409038738e-06, "loss": 0.4731, "num_tokens": 8585655540.0, "step": 2056 }, { "epoch": 4.11419984973704, "grad_norm": 0.0875223411266965, "learning_rate": 6.844815900338535e-06, "loss": 0.4762, "num_tokens": 8589849844.0, "step": 2057 }, { "epoch": 4.116203355872777, "grad_norm": 0.08442171613847602, "learning_rate": 6.832220932183833e-06, "loss": 0.4719, "num_tokens": 8594040169.0, "step": 2058 }, { "epoch": 4.118206862008515, "grad_norm": 0.08016736047288248, "learning_rate": 6.81965152580053e-06, "loss": 0.4668, "num_tokens": 8598223304.0, "step": 2059 }, { "epoch": 4.120210368144252, "grad_norm": 0.08302904411848532, "learning_rate": 6.807107702371441e-06, "loss": 0.4802, "num_tokens": 8602416932.0, "step": 2060 }, { "epoch": 4.12221387427999, "grad_norm": 0.08536472432513685, "learning_rate": 6.794589483036276e-06, "loss": 0.4769, "num_tokens": 8606611236.0, "step": 2061 }, { "epoch": 4.124217380415727, "grad_norm": 0.09040748739259405, "learning_rate": 6.782096888891576e-06, "loss": 0.4736, "num_tokens": 8610805540.0, "step": 2062 }, { "epoch": 4.126220886551465, "grad_norm": 0.08232118626000436, "learning_rate": 6.7696299409907254e-06, "loss": 0.4677, "num_tokens": 8614981503.0, "step": 2063 }, { "epoch": 4.128224392687202, "grad_norm": 0.0905470174629063, "learning_rate": 6.757188660343854e-06, "loss": 0.4737, "num_tokens": 8619175807.0, "step": 2064 }, { "epoch": 4.13022789882294, "grad_norm": 0.08407722016981604, "learning_rate": 6.744773067917864e-06, "loss": 0.4683, "num_tokens": 8623370111.0, "step": 2065 }, { "epoch": 4.132231404958677, "grad_norm": 0.09565450139715861, "learning_rate": 6.732383184636353e-06, "loss": 0.4615, "num_tokens": 8627564415.0, "step": 2066 }, { "epoch": 4.134234911094415, "grad_norm": 0.09246998694343107, "learning_rate": 6.7200190313795946e-06, "loss": 0.4494, "num_tokens": 8631758719.0, "step": 2067 }, { "epoch": 4.136238417230153, "grad_norm": 0.08704253480728448, "learning_rate": 6.707680628984497e-06, "loss": 0.476, "num_tokens": 8635925115.0, "step": 2068 }, { "epoch": 4.13824192336589, "grad_norm": 0.0828001293917864, "learning_rate": 6.695367998244575e-06, "loss": 0.4696, "num_tokens": 8640094448.0, "step": 2069 }, { "epoch": 4.140245429501628, "grad_norm": 0.08604511533894532, "learning_rate": 6.68308115990991e-06, "loss": 0.4627, "num_tokens": 8644288752.0, "step": 2070 }, { "epoch": 4.142248935637365, "grad_norm": 0.07983011893946382, "learning_rate": 6.670820134687119e-06, "loss": 0.4673, "num_tokens": 8648483056.0, "step": 2071 }, { "epoch": 4.144252441773103, "grad_norm": 0.07515433062402445, "learning_rate": 6.65858494323931e-06, "loss": 0.4595, "num_tokens": 8652660766.0, "step": 2072 }, { "epoch": 4.14625594790884, "grad_norm": 0.08154981549317532, "learning_rate": 6.646375606186063e-06, "loss": 0.4714, "num_tokens": 8656827230.0, "step": 2073 }, { "epoch": 4.148259454044578, "grad_norm": 0.08307122561320732, "learning_rate": 6.6341921441033704e-06, "loss": 0.4876, "num_tokens": 8661021534.0, "step": 2074 }, { "epoch": 4.150262960180315, "grad_norm": 0.08481854439374847, "learning_rate": 6.622034577523648e-06, "loss": 0.4703, "num_tokens": 8665187805.0, "step": 2075 }, { "epoch": 4.152266466316053, "grad_norm": 0.08224931815287072, "learning_rate": 6.609902926935641e-06, "loss": 0.472, "num_tokens": 8669382109.0, "step": 2076 }, { "epoch": 4.15426997245179, "grad_norm": 0.07909937600820736, "learning_rate": 6.597797212784437e-06, "loss": 0.4708, "num_tokens": 8673576413.0, "step": 2077 }, { "epoch": 4.156273478587528, "grad_norm": 0.07671946971923797, "learning_rate": 6.585717455471401e-06, "loss": 0.4677, "num_tokens": 8677770717.0, "step": 2078 }, { "epoch": 4.1582769847232655, "grad_norm": 0.08360358562822127, "learning_rate": 6.573663675354166e-06, "loss": 0.4634, "num_tokens": 8681965021.0, "step": 2079 }, { "epoch": 4.160280490859003, "grad_norm": 0.08506691523974552, "learning_rate": 6.561635892746574e-06, "loss": 0.4908, "num_tokens": 8686155250.0, "step": 2080 }, { "epoch": 4.1622839969947405, "grad_norm": 0.08433194014274173, "learning_rate": 6.549634127918671e-06, "loss": 0.463, "num_tokens": 8690328003.0, "step": 2081 }, { "epoch": 4.1642875031304785, "grad_norm": 0.08018156722035191, "learning_rate": 6.537658401096639e-06, "loss": 0.4679, "num_tokens": 8694522307.0, "step": 2082 }, { "epoch": 4.1662910092662155, "grad_norm": 0.08063833895762976, "learning_rate": 6.525708732462786e-06, "loss": 0.4694, "num_tokens": 8698694474.0, "step": 2083 }, { "epoch": 4.1682945154019535, "grad_norm": 0.08373834665405112, "learning_rate": 6.5137851421555075e-06, "loss": 0.4661, "num_tokens": 8702888778.0, "step": 2084 }, { "epoch": 4.170298021537691, "grad_norm": 0.08913489189276702, "learning_rate": 6.50188765026925e-06, "loss": 0.4739, "num_tokens": 8707050486.0, "step": 2085 }, { "epoch": 4.1723015276734285, "grad_norm": 0.08883463928999462, "learning_rate": 6.490016276854471e-06, "loss": 0.4669, "num_tokens": 8711236251.0, "step": 2086 }, { "epoch": 4.174305033809166, "grad_norm": 0.08111408475589907, "learning_rate": 6.478171041917619e-06, "loss": 0.4676, "num_tokens": 8715430555.0, "step": 2087 }, { "epoch": 4.176308539944904, "grad_norm": 0.09460837117429698, "learning_rate": 6.466351965421082e-06, "loss": 0.4737, "num_tokens": 8719624859.0, "step": 2088 }, { "epoch": 4.1783120460806416, "grad_norm": 0.08905684079283104, "learning_rate": 6.454559067283175e-06, "loss": 0.4678, "num_tokens": 8723810275.0, "step": 2089 }, { "epoch": 4.180315552216379, "grad_norm": 0.08290094734186815, "learning_rate": 6.442792367378092e-06, "loss": 0.4625, "num_tokens": 8727983136.0, "step": 2090 }, { "epoch": 4.182319058352117, "grad_norm": 0.08388080652518615, "learning_rate": 6.4310518855358695e-06, "loss": 0.4716, "num_tokens": 8732176796.0, "step": 2091 }, { "epoch": 4.184322564487854, "grad_norm": 0.09758815771313374, "learning_rate": 6.419337641542371e-06, "loss": 0.481, "num_tokens": 8736345465.0, "step": 2092 }, { "epoch": 4.186326070623592, "grad_norm": 0.08498563186655321, "learning_rate": 6.407649655139232e-06, "loss": 0.4698, "num_tokens": 8740523288.0, "step": 2093 }, { "epoch": 4.188329576759329, "grad_norm": 0.08122964092184277, "learning_rate": 6.395987946023843e-06, "loss": 0.4673, "num_tokens": 8744653643.0, "step": 2094 }, { "epoch": 4.190333082895067, "grad_norm": 0.0926210691440566, "learning_rate": 6.384352533849306e-06, "loss": 0.4582, "num_tokens": 8748826093.0, "step": 2095 }, { "epoch": 4.192336589030804, "grad_norm": 0.08413001430231257, "learning_rate": 6.372743438224412e-06, "loss": 0.4738, "num_tokens": 8752994634.0, "step": 2096 }, { "epoch": 4.194340095166542, "grad_norm": 0.10261127445260772, "learning_rate": 6.361160678713592e-06, "loss": 0.4732, "num_tokens": 8757181905.0, "step": 2097 }, { "epoch": 4.196343601302279, "grad_norm": 0.08785548004115473, "learning_rate": 6.349604274836907e-06, "loss": 0.4663, "num_tokens": 8761376209.0, "step": 2098 }, { "epoch": 4.198347107438017, "grad_norm": 0.08883622735654985, "learning_rate": 6.338074246069982e-06, "loss": 0.4568, "num_tokens": 8765570513.0, "step": 2099 }, { "epoch": 4.200350613573754, "grad_norm": 0.08855677354989158, "learning_rate": 6.326570611844014e-06, "loss": 0.4691, "num_tokens": 8769764817.0, "step": 2100 }, { "epoch": 4.202354119709492, "grad_norm": 0.08225978596859229, "learning_rate": 6.315093391545704e-06, "loss": 0.4671, "num_tokens": 8773954106.0, "step": 2101 }, { "epoch": 4.204357625845229, "grad_norm": 0.09216070471916378, "learning_rate": 6.303642604517242e-06, "loss": 0.4672, "num_tokens": 8778146541.0, "step": 2102 }, { "epoch": 4.206361131980967, "grad_norm": 0.08297126939399038, "learning_rate": 6.292218270056277e-06, "loss": 0.4554, "num_tokens": 8782340845.0, "step": 2103 }, { "epoch": 4.208364638116704, "grad_norm": 0.08626024756478334, "learning_rate": 6.280820407415874e-06, "loss": 0.4573, "num_tokens": 8786535149.0, "step": 2104 }, { "epoch": 4.210368144252442, "grad_norm": 0.0804496784469706, "learning_rate": 6.269449035804477e-06, "loss": 0.48, "num_tokens": 8790729453.0, "step": 2105 }, { "epoch": 4.212371650388179, "grad_norm": 0.08383322597220026, "learning_rate": 6.258104174385905e-06, "loss": 0.4637, "num_tokens": 8794923757.0, "step": 2106 }, { "epoch": 4.214375156523917, "grad_norm": 0.08364618375124896, "learning_rate": 6.246785842279278e-06, "loss": 0.4701, "num_tokens": 8799109913.0, "step": 2107 }, { "epoch": 4.216378662659654, "grad_norm": 0.08228781633377191, "learning_rate": 6.235494058559024e-06, "loss": 0.4635, "num_tokens": 8803285046.0, "step": 2108 }, { "epoch": 4.218382168795392, "grad_norm": 0.08406311950573944, "learning_rate": 6.224228842254818e-06, "loss": 0.4722, "num_tokens": 8807462005.0, "step": 2109 }, { "epoch": 4.22038567493113, "grad_norm": 0.08480164413447025, "learning_rate": 6.212990212351578e-06, "loss": 0.4656, "num_tokens": 8811651758.0, "step": 2110 }, { "epoch": 4.222389181066867, "grad_norm": 0.07586919219606678, "learning_rate": 6.201778187789399e-06, "loss": 0.4709, "num_tokens": 8815824623.0, "step": 2111 }, { "epoch": 4.224392687202605, "grad_norm": 0.07849799736699788, "learning_rate": 6.19059278746355e-06, "loss": 0.4752, "num_tokens": 8820018927.0, "step": 2112 }, { "epoch": 4.226396193338342, "grad_norm": 0.08132403168660726, "learning_rate": 6.17943403022442e-06, "loss": 0.4792, "num_tokens": 8824197791.0, "step": 2113 }, { "epoch": 4.22839969947408, "grad_norm": 0.08866491494813122, "learning_rate": 6.1683019348775155e-06, "loss": 0.4805, "num_tokens": 8828392095.0, "step": 2114 }, { "epoch": 4.230403205609817, "grad_norm": 0.08391817481860908, "learning_rate": 6.157196520183387e-06, "loss": 0.4823, "num_tokens": 8832586399.0, "step": 2115 }, { "epoch": 4.232406711745555, "grad_norm": 0.0761071937029762, "learning_rate": 6.1461178048576455e-06, "loss": 0.4684, "num_tokens": 8836780703.0, "step": 2116 }, { "epoch": 4.234410217881292, "grad_norm": 0.08703698203591631, "learning_rate": 6.135065807570888e-06, "loss": 0.4814, "num_tokens": 8840907761.0, "step": 2117 }, { "epoch": 4.23641372401703, "grad_norm": 0.08493323574296016, "learning_rate": 6.1240405469486875e-06, "loss": 0.4695, "num_tokens": 8845102065.0, "step": 2118 }, { "epoch": 4.238417230152767, "grad_norm": 0.08667678880516552, "learning_rate": 6.113042041571569e-06, "loss": 0.457, "num_tokens": 8849296369.0, "step": 2119 }, { "epoch": 4.240420736288505, "grad_norm": 0.08551710143458144, "learning_rate": 6.102070309974962e-06, "loss": 0.4751, "num_tokens": 8853490673.0, "step": 2120 }, { "epoch": 4.242424242424242, "grad_norm": 0.07815538689190006, "learning_rate": 6.091125370649167e-06, "loss": 0.4657, "num_tokens": 8857684977.0, "step": 2121 }, { "epoch": 4.24442774855998, "grad_norm": 0.08012792672307713, "learning_rate": 6.080207242039344e-06, "loss": 0.4658, "num_tokens": 8861878704.0, "step": 2122 }, { "epoch": 4.246431254695717, "grad_norm": 0.08122613293871575, "learning_rate": 6.06931594254546e-06, "loss": 0.4688, "num_tokens": 8866035846.0, "step": 2123 }, { "epoch": 4.248434760831455, "grad_norm": 0.07935437430247774, "learning_rate": 6.05845149052228e-06, "loss": 0.4727, "num_tokens": 8870230150.0, "step": 2124 }, { "epoch": 4.250438266967192, "grad_norm": 0.08377979309755994, "learning_rate": 6.047613904279308e-06, "loss": 0.4785, "num_tokens": 8874424454.0, "step": 2125 }, { "epoch": 4.25244177310293, "grad_norm": 0.08173369361191418, "learning_rate": 6.036803202080787e-06, "loss": 0.4717, "num_tokens": 8878594670.0, "step": 2126 }, { "epoch": 4.254445279238667, "grad_norm": 0.07816512589944356, "learning_rate": 6.026019402145638e-06, "loss": 0.4648, "num_tokens": 8882768503.0, "step": 2127 }, { "epoch": 4.256448785374405, "grad_norm": 0.0835517709427554, "learning_rate": 6.015262522647464e-06, "loss": 0.4713, "num_tokens": 8886953126.0, "step": 2128 }, { "epoch": 4.258452291510142, "grad_norm": 0.08841347453905202, "learning_rate": 6.004532581714482e-06, "loss": 0.4755, "num_tokens": 8891147430.0, "step": 2129 }, { "epoch": 4.26045579764588, "grad_norm": 0.08908677548314838, "learning_rate": 5.993829597429523e-06, "loss": 0.462, "num_tokens": 8895341734.0, "step": 2130 }, { "epoch": 4.262459303781618, "grad_norm": 0.08254461282842569, "learning_rate": 5.983153587829974e-06, "loss": 0.4663, "num_tokens": 8899506799.0, "step": 2131 }, { "epoch": 4.264462809917355, "grad_norm": 0.07510655107115544, "learning_rate": 5.97250457090778e-06, "loss": 0.4627, "num_tokens": 8903701103.0, "step": 2132 }, { "epoch": 4.2664663160530925, "grad_norm": 0.0898258825397736, "learning_rate": 5.96188256460938e-06, "loss": 0.4708, "num_tokens": 8907895407.0, "step": 2133 }, { "epoch": 4.26846982218883, "grad_norm": 0.08434496170288293, "learning_rate": 5.951287586835708e-06, "loss": 0.4704, "num_tokens": 8912089711.0, "step": 2134 }, { "epoch": 4.270473328324568, "grad_norm": 0.08381699629730592, "learning_rate": 5.9407196554421355e-06, "loss": 0.4709, "num_tokens": 8916284015.0, "step": 2135 }, { "epoch": 4.2724768344603055, "grad_norm": 0.08816972595539246, "learning_rate": 5.930178788238454e-06, "loss": 0.4711, "num_tokens": 8920478209.0, "step": 2136 }, { "epoch": 4.274480340596043, "grad_norm": 0.09555416125537759, "learning_rate": 5.919665002988858e-06, "loss": 0.4728, "num_tokens": 8924672513.0, "step": 2137 }, { "epoch": 4.2764838467317805, "grad_norm": 0.08162165334815429, "learning_rate": 5.909178317411891e-06, "loss": 0.4693, "num_tokens": 8928844017.0, "step": 2138 }, { "epoch": 4.2784873528675185, "grad_norm": 0.08246254132084145, "learning_rate": 5.898718749180422e-06, "loss": 0.4677, "num_tokens": 8933033658.0, "step": 2139 }, { "epoch": 4.2804908590032555, "grad_norm": 0.08296829951372184, "learning_rate": 5.888286315921631e-06, "loss": 0.4791, "num_tokens": 8937203766.0, "step": 2140 }, { "epoch": 4.2824943651389935, "grad_norm": 0.08457655027456183, "learning_rate": 5.8778810352169565e-06, "loss": 0.4669, "num_tokens": 8941391981.0, "step": 2141 }, { "epoch": 4.284497871274731, "grad_norm": 0.07999542672581886, "learning_rate": 5.867502924602093e-06, "loss": 0.4707, "num_tokens": 8945567221.0, "step": 2142 }, { "epoch": 4.2865013774104685, "grad_norm": 0.07978656501943188, "learning_rate": 5.857152001566931e-06, "loss": 0.4615, "num_tokens": 8949723441.0, "step": 2143 }, { "epoch": 4.288504883546206, "grad_norm": 0.07922108294686568, "learning_rate": 5.846828283555553e-06, "loss": 0.4691, "num_tokens": 8953886115.0, "step": 2144 }, { "epoch": 4.290508389681944, "grad_norm": 0.08701306931013925, "learning_rate": 5.836531787966181e-06, "loss": 0.4658, "num_tokens": 8958050973.0, "step": 2145 }, { "epoch": 4.292511895817681, "grad_norm": 0.08617401226382922, "learning_rate": 5.8262625321511815e-06, "loss": 0.4634, "num_tokens": 8962213773.0, "step": 2146 }, { "epoch": 4.294515401953419, "grad_norm": 0.07850635707303556, "learning_rate": 5.816020533416994e-06, "loss": 0.4808, "num_tokens": 8966351864.0, "step": 2147 }, { "epoch": 4.296518908089156, "grad_norm": 0.08360309134859675, "learning_rate": 5.805805809024134e-06, "loss": 0.4741, "num_tokens": 8970546168.0, "step": 2148 }, { "epoch": 4.298522414224894, "grad_norm": 0.0763904850013944, "learning_rate": 5.795618376187143e-06, "loss": 0.4706, "num_tokens": 8974719667.0, "step": 2149 }, { "epoch": 4.300525920360631, "grad_norm": 0.07882727411826523, "learning_rate": 5.785458252074583e-06, "loss": 0.462, "num_tokens": 8978903704.0, "step": 2150 }, { "epoch": 4.302529426496369, "grad_norm": 0.08177346165680317, "learning_rate": 5.775325453808975e-06, "loss": 0.4773, "num_tokens": 8983078771.0, "step": 2151 }, { "epoch": 4.304532932632106, "grad_norm": 0.07959794556356713, "learning_rate": 5.765219998466807e-06, "loss": 0.4704, "num_tokens": 8987273075.0, "step": 2152 }, { "epoch": 4.306536438767844, "grad_norm": 0.08561026483114416, "learning_rate": 5.755141903078477e-06, "loss": 0.4645, "num_tokens": 8991448980.0, "step": 2153 }, { "epoch": 4.308539944903581, "grad_norm": 0.0864807657719947, "learning_rate": 5.7450911846282745e-06, "loss": 0.4726, "num_tokens": 8995643284.0, "step": 2154 }, { "epoch": 4.310543451039319, "grad_norm": 0.09011256741732232, "learning_rate": 5.735067860054349e-06, "loss": 0.4668, "num_tokens": 8999837588.0, "step": 2155 }, { "epoch": 4.312546957175057, "grad_norm": 0.07943550701407102, "learning_rate": 5.725071946248697e-06, "loss": 0.4733, "num_tokens": 9004026584.0, "step": 2156 }, { "epoch": 4.314550463310794, "grad_norm": 0.08275534046592947, "learning_rate": 5.715103460057104e-06, "loss": 0.4716, "num_tokens": 9008193477.0, "step": 2157 }, { "epoch": 4.316553969446532, "grad_norm": 0.0885985313090145, "learning_rate": 5.7051624182791455e-06, "loss": 0.4779, "num_tokens": 9012387781.0, "step": 2158 }, { "epoch": 4.318557475582269, "grad_norm": 0.07951892516673588, "learning_rate": 5.695248837668132e-06, "loss": 0.4651, "num_tokens": 9016551364.0, "step": 2159 }, { "epoch": 4.320560981718007, "grad_norm": 0.08035689283460222, "learning_rate": 5.685362734931111e-06, "loss": 0.4737, "num_tokens": 9020717048.0, "step": 2160 }, { "epoch": 4.322564487853744, "grad_norm": 0.0809671432125536, "learning_rate": 5.67550412672881e-06, "loss": 0.4605, "num_tokens": 9024907051.0, "step": 2161 }, { "epoch": 4.324567993989482, "grad_norm": 0.09049645813936312, "learning_rate": 5.665673029675626e-06, "loss": 0.4656, "num_tokens": 9029075661.0, "step": 2162 }, { "epoch": 4.326571500125219, "grad_norm": 0.08079014620428272, "learning_rate": 5.655869460339592e-06, "loss": 0.476, "num_tokens": 9033264215.0, "step": 2163 }, { "epoch": 4.328575006260957, "grad_norm": 0.08012451782061875, "learning_rate": 5.6460934352423506e-06, "loss": 0.4703, "num_tokens": 9037458519.0, "step": 2164 }, { "epoch": 4.330578512396694, "grad_norm": 0.07657118096168761, "learning_rate": 5.636344970859116e-06, "loss": 0.4557, "num_tokens": 9041632292.0, "step": 2165 }, { "epoch": 4.332582018532432, "grad_norm": 0.08091236040531403, "learning_rate": 5.626624083618669e-06, "loss": 0.4725, "num_tokens": 9045825167.0, "step": 2166 }, { "epoch": 4.334585524668169, "grad_norm": 0.08162299025643084, "learning_rate": 5.616930789903311e-06, "loss": 0.4646, "num_tokens": 9050019471.0, "step": 2167 }, { "epoch": 4.336589030803907, "grad_norm": 0.08017187679076904, "learning_rate": 5.6072651060488305e-06, "loss": 0.4695, "num_tokens": 9054210745.0, "step": 2168 }, { "epoch": 4.338592536939644, "grad_norm": 0.0819530687827271, "learning_rate": 5.597627048344504e-06, "loss": 0.4709, "num_tokens": 9058383038.0, "step": 2169 }, { "epoch": 4.340596043075382, "grad_norm": 0.07885944639998968, "learning_rate": 5.588016633033033e-06, "loss": 0.4745, "num_tokens": 9062535064.0, "step": 2170 }, { "epoch": 4.342599549211119, "grad_norm": 0.08237888618526173, "learning_rate": 5.578433876310546e-06, "loss": 0.4698, "num_tokens": 9066718586.0, "step": 2171 }, { "epoch": 4.344603055346857, "grad_norm": 0.08657868844191557, "learning_rate": 5.5688787943265525e-06, "loss": 0.4681, "num_tokens": 9070912890.0, "step": 2172 }, { "epoch": 4.346606561482594, "grad_norm": 0.08022175229155408, "learning_rate": 5.55935140318393e-06, "loss": 0.4574, "num_tokens": 9075107194.0, "step": 2173 }, { "epoch": 4.348610067618332, "grad_norm": 0.08369361274747834, "learning_rate": 5.549851718938881e-06, "loss": 0.4717, "num_tokens": 9079278513.0, "step": 2174 }, { "epoch": 4.350613573754069, "grad_norm": 0.08115789367693907, "learning_rate": 5.540379757600923e-06, "loss": 0.4708, "num_tokens": 9083448925.0, "step": 2175 }, { "epoch": 4.352617079889807, "grad_norm": 0.08438179929204503, "learning_rate": 5.530935535132844e-06, "loss": 0.4706, "num_tokens": 9087643229.0, "step": 2176 }, { "epoch": 4.354620586025545, "grad_norm": 0.08612704453260583, "learning_rate": 5.521519067450692e-06, "loss": 0.4675, "num_tokens": 9091837533.0, "step": 2177 }, { "epoch": 4.356624092161282, "grad_norm": 0.09289058291232007, "learning_rate": 5.512130370423736e-06, "loss": 0.4671, "num_tokens": 9096027455.0, "step": 2178 }, { "epoch": 4.35862759829702, "grad_norm": 0.08321675959724054, "learning_rate": 5.50276945987445e-06, "loss": 0.4561, "num_tokens": 9100216640.0, "step": 2179 }, { "epoch": 4.360631104432757, "grad_norm": 0.08047103297187715, "learning_rate": 5.493436351578468e-06, "loss": 0.4768, "num_tokens": 9104410944.0, "step": 2180 }, { "epoch": 4.362634610568495, "grad_norm": 0.08131246704949072, "learning_rate": 5.484131061264584e-06, "loss": 0.4728, "num_tokens": 9108591527.0, "step": 2181 }, { "epoch": 4.364638116704232, "grad_norm": 0.083810004640177, "learning_rate": 5.4748536046147025e-06, "loss": 0.4692, "num_tokens": 9112778826.0, "step": 2182 }, { "epoch": 4.36664162283997, "grad_norm": 0.07878001654641517, "learning_rate": 5.465603997263829e-06, "loss": 0.472, "num_tokens": 9116973130.0, "step": 2183 }, { "epoch": 4.368645128975707, "grad_norm": 0.09050014281337475, "learning_rate": 5.4563822548000245e-06, "loss": 0.4701, "num_tokens": 9121130958.0, "step": 2184 }, { "epoch": 4.370648635111445, "grad_norm": 0.07888461306526846, "learning_rate": 5.4471883927643985e-06, "loss": 0.4745, "num_tokens": 9125298361.0, "step": 2185 }, { "epoch": 4.372652141247182, "grad_norm": 0.07866516919550343, "learning_rate": 5.438022426651067e-06, "loss": 0.4587, "num_tokens": 9129462002.0, "step": 2186 }, { "epoch": 4.37465564738292, "grad_norm": 0.09779895222065679, "learning_rate": 5.428884371907147e-06, "loss": 0.4744, "num_tokens": 9133656306.0, "step": 2187 }, { "epoch": 4.376659153518657, "grad_norm": 0.09107884976538427, "learning_rate": 5.4197742439327e-06, "loss": 0.4782, "num_tokens": 9137829230.0, "step": 2188 }, { "epoch": 4.378662659654395, "grad_norm": 0.08067017249399527, "learning_rate": 5.4106920580807395e-06, "loss": 0.477, "num_tokens": 9142005549.0, "step": 2189 }, { "epoch": 4.3806661657901325, "grad_norm": 0.08732423250177364, "learning_rate": 5.401637829657174e-06, "loss": 0.4658, "num_tokens": 9146199853.0, "step": 2190 }, { "epoch": 4.38266967192587, "grad_norm": 0.08991131006079148, "learning_rate": 5.3926115739208106e-06, "loss": 0.4667, "num_tokens": 9150386695.0, "step": 2191 }, { "epoch": 4.3846731780616075, "grad_norm": 0.08314720084758544, "learning_rate": 5.383613306083308e-06, "loss": 0.4665, "num_tokens": 9154541885.0, "step": 2192 }, { "epoch": 4.3866766841973455, "grad_norm": 0.07938401827348662, "learning_rate": 5.374643041309158e-06, "loss": 0.4767, "num_tokens": 9158736189.0, "step": 2193 }, { "epoch": 4.3886801903330825, "grad_norm": 0.08466820531406007, "learning_rate": 5.365700794715658e-06, "loss": 0.4718, "num_tokens": 9162930493.0, "step": 2194 }, { "epoch": 4.3906836964688205, "grad_norm": 0.08528380501195103, "learning_rate": 5.356786581372892e-06, "loss": 0.4609, "num_tokens": 9167124797.0, "step": 2195 }, { "epoch": 4.392687202604558, "grad_norm": 0.0826268953857568, "learning_rate": 5.347900416303694e-06, "loss": 0.4636, "num_tokens": 9171290236.0, "step": 2196 }, { "epoch": 4.3946907087402955, "grad_norm": 0.0782749018836659, "learning_rate": 5.339042314483636e-06, "loss": 0.4601, "num_tokens": 9175484540.0, "step": 2197 }, { "epoch": 4.3966942148760335, "grad_norm": 0.07676582594778067, "learning_rate": 5.330212290840989e-06, "loss": 0.4819, "num_tokens": 9179672595.0, "step": 2198 }, { "epoch": 4.398697721011771, "grad_norm": 0.09316362522201545, "learning_rate": 5.321410360256714e-06, "loss": 0.4757, "num_tokens": 9183866899.0, "step": 2199 }, { "epoch": 4.4007012271475086, "grad_norm": 0.0804111508495427, "learning_rate": 5.312636537564418e-06, "loss": 0.4615, "num_tokens": 9188061203.0, "step": 2200 }, { "epoch": 4.402704733283246, "grad_norm": 0.08474867963801357, "learning_rate": 5.3038908375503485e-06, "loss": 0.4867, "num_tokens": 9192247537.0, "step": 2201 }, { "epoch": 4.404708239418984, "grad_norm": 0.08134134840982596, "learning_rate": 5.295173274953345e-06, "loss": 0.4681, "num_tokens": 9196441841.0, "step": 2202 }, { "epoch": 4.406711745554721, "grad_norm": 0.08155956028334813, "learning_rate": 5.2864838644648416e-06, "loss": 0.4805, "num_tokens": 9200636145.0, "step": 2203 }, { "epoch": 4.408715251690459, "grad_norm": 0.08054128504006235, "learning_rate": 5.277822620728821e-06, "loss": 0.473, "num_tokens": 9204801650.0, "step": 2204 }, { "epoch": 4.410718757826196, "grad_norm": 0.07579105379290987, "learning_rate": 5.269189558341801e-06, "loss": 0.4679, "num_tokens": 9208981963.0, "step": 2205 }, { "epoch": 4.412722263961934, "grad_norm": 0.08333354166486033, "learning_rate": 5.260584691852802e-06, "loss": 0.4914, "num_tokens": 9213165224.0, "step": 2206 }, { "epoch": 4.414725770097671, "grad_norm": 0.09335800631545654, "learning_rate": 5.252008035763329e-06, "loss": 0.466, "num_tokens": 9217294148.0, "step": 2207 }, { "epoch": 4.416729276233409, "grad_norm": 0.08378992838486184, "learning_rate": 5.243459604527349e-06, "loss": 0.4646, "num_tokens": 9221477134.0, "step": 2208 }, { "epoch": 4.418732782369146, "grad_norm": 0.07568409154864307, "learning_rate": 5.234939412551257e-06, "loss": 0.4701, "num_tokens": 9225671438.0, "step": 2209 }, { "epoch": 4.420736288504884, "grad_norm": 0.08224810072155062, "learning_rate": 5.22644747419386e-06, "loss": 0.4791, "num_tokens": 9229855445.0, "step": 2210 }, { "epoch": 4.422739794640621, "grad_norm": 0.08097285132931376, "learning_rate": 5.217983803766348e-06, "loss": 0.4883, "num_tokens": 9234021696.0, "step": 2211 }, { "epoch": 4.424743300776359, "grad_norm": 0.08013897697558554, "learning_rate": 5.209548415532272e-06, "loss": 0.4673, "num_tokens": 9238216000.0, "step": 2212 }, { "epoch": 4.426746806912096, "grad_norm": 0.07840613735440956, "learning_rate": 5.201141323707527e-06, "loss": 0.4702, "num_tokens": 9242410304.0, "step": 2213 }, { "epoch": 4.428750313047834, "grad_norm": 0.0788015389329259, "learning_rate": 5.192762542460307e-06, "loss": 0.4641, "num_tokens": 9246604608.0, "step": 2214 }, { "epoch": 4.430753819183571, "grad_norm": 0.0823891342863297, "learning_rate": 5.1844120859111105e-06, "loss": 0.4732, "num_tokens": 9250798912.0, "step": 2215 }, { "epoch": 4.432757325319309, "grad_norm": 0.09251830319902773, "learning_rate": 5.176089968132683e-06, "loss": 0.4684, "num_tokens": 9254953324.0, "step": 2216 }, { "epoch": 4.434760831455046, "grad_norm": 0.07973234363754039, "learning_rate": 5.167796203150038e-06, "loss": 0.4818, "num_tokens": 9259134905.0, "step": 2217 }, { "epoch": 4.436764337590784, "grad_norm": 0.08726960349404847, "learning_rate": 5.159530804940379e-06, "loss": 0.4797, "num_tokens": 9263299076.0, "step": 2218 }, { "epoch": 4.438767843726521, "grad_norm": 0.07678503260793491, "learning_rate": 5.151293787433126e-06, "loss": 0.48, "num_tokens": 9267479889.0, "step": 2219 }, { "epoch": 4.440771349862259, "grad_norm": 0.08578403001286034, "learning_rate": 5.143085164509851e-06, "loss": 0.4691, "num_tokens": 9271649190.0, "step": 2220 }, { "epoch": 4.442774855997996, "grad_norm": 0.09699841545037345, "learning_rate": 5.134904950004292e-06, "loss": 0.4576, "num_tokens": 9275843494.0, "step": 2221 }, { "epoch": 4.444778362133734, "grad_norm": 0.08812442953738403, "learning_rate": 5.126753157702294e-06, "loss": 0.4854, "num_tokens": 9280037798.0, "step": 2222 }, { "epoch": 4.446781868269472, "grad_norm": 0.07915318999304574, "learning_rate": 5.118629801341814e-06, "loss": 0.4745, "num_tokens": 9284227630.0, "step": 2223 }, { "epoch": 4.448785374405209, "grad_norm": 0.0845272290316268, "learning_rate": 5.110534894612885e-06, "loss": 0.4618, "num_tokens": 9288399403.0, "step": 2224 }, { "epoch": 4.450788880540947, "grad_norm": 0.1007653291401267, "learning_rate": 5.10246845115759e-06, "loss": 0.4872, "num_tokens": 9292593707.0, "step": 2225 }, { "epoch": 4.452792386676684, "grad_norm": 0.0833574584300091, "learning_rate": 5.094430484570052e-06, "loss": 0.4595, "num_tokens": 9296764381.0, "step": 2226 }, { "epoch": 4.454795892812422, "grad_norm": 0.07842789598736898, "learning_rate": 5.086421008396398e-06, "loss": 0.4752, "num_tokens": 9300958685.0, "step": 2227 }, { "epoch": 4.456799398948159, "grad_norm": 0.07751514018845777, "learning_rate": 5.078440036134739e-06, "loss": 0.4698, "num_tokens": 9305130801.0, "step": 2228 }, { "epoch": 4.458802905083897, "grad_norm": 0.09567109367872986, "learning_rate": 5.070487581235149e-06, "loss": 0.4745, "num_tokens": 9309325105.0, "step": 2229 }, { "epoch": 4.460806411219634, "grad_norm": 0.07978663274661581, "learning_rate": 5.062563657099653e-06, "loss": 0.4617, "num_tokens": 9313493405.0, "step": 2230 }, { "epoch": 4.462809917355372, "grad_norm": 0.07494050373209378, "learning_rate": 5.0546682770821775e-06, "loss": 0.469, "num_tokens": 9317687709.0, "step": 2231 }, { "epoch": 4.464813423491109, "grad_norm": 0.0814984489808966, "learning_rate": 5.046801454488558e-06, "loss": 0.4816, "num_tokens": 9321872732.0, "step": 2232 }, { "epoch": 4.466816929626847, "grad_norm": 0.08151810808904057, "learning_rate": 5.0389632025764945e-06, "loss": 0.471, "num_tokens": 9326067036.0, "step": 2233 }, { "epoch": 4.468820435762584, "grad_norm": 0.08152469956649369, "learning_rate": 5.031153534555545e-06, "loss": 0.4623, "num_tokens": 9330254895.0, "step": 2234 }, { "epoch": 4.470823941898322, "grad_norm": 0.08857726337383608, "learning_rate": 5.023372463587095e-06, "loss": 0.476, "num_tokens": 9334430571.0, "step": 2235 }, { "epoch": 4.472827448034059, "grad_norm": 0.08038765478420153, "learning_rate": 5.015620002784326e-06, "loss": 0.4642, "num_tokens": 9338615211.0, "step": 2236 }, { "epoch": 4.474830954169797, "grad_norm": 0.08606573187948738, "learning_rate": 5.007896165212216e-06, "loss": 0.4743, "num_tokens": 9342809515.0, "step": 2237 }, { "epoch": 4.476834460305534, "grad_norm": 0.08392604438188199, "learning_rate": 5.000200963887503e-06, "loss": 0.4764, "num_tokens": 9346993058.0, "step": 2238 }, { "epoch": 4.478837966441272, "grad_norm": 0.0839633934196141, "learning_rate": 4.992534411778657e-06, "loss": 0.4592, "num_tokens": 9351187362.0, "step": 2239 }, { "epoch": 4.480841472577009, "grad_norm": 0.0802213829694207, "learning_rate": 4.984896521805877e-06, "loss": 0.4686, "num_tokens": 9355381666.0, "step": 2240 }, { "epoch": 4.482844978712747, "grad_norm": 0.08014100980155377, "learning_rate": 4.9772873068410504e-06, "loss": 0.473, "num_tokens": 9359575970.0, "step": 2241 }, { "epoch": 4.484848484848484, "grad_norm": 0.07654254946996814, "learning_rate": 4.969706779707748e-06, "loss": 0.4675, "num_tokens": 9363770274.0, "step": 2242 }, { "epoch": 4.486851990984222, "grad_norm": 0.08198393958057223, "learning_rate": 4.96215495318118e-06, "loss": 0.4599, "num_tokens": 9367910842.0, "step": 2243 }, { "epoch": 4.48885549711996, "grad_norm": 0.0804781119324802, "learning_rate": 4.9546318399882096e-06, "loss": 0.4717, "num_tokens": 9372091208.0, "step": 2244 }, { "epoch": 4.490859003255697, "grad_norm": 0.07728322931074937, "learning_rate": 4.947137452807288e-06, "loss": 0.4701, "num_tokens": 9376285512.0, "step": 2245 }, { "epoch": 4.492862509391435, "grad_norm": 0.07440998815687597, "learning_rate": 4.939671804268472e-06, "loss": 0.4732, "num_tokens": 9380454342.0, "step": 2246 }, { "epoch": 4.4948660155271725, "grad_norm": 0.08601969955971149, "learning_rate": 4.932234906953372e-06, "loss": 0.4587, "num_tokens": 9384648646.0, "step": 2247 }, { "epoch": 4.49686952166291, "grad_norm": 0.07531001037533526, "learning_rate": 4.924826773395162e-06, "loss": 0.4634, "num_tokens": 9388813969.0, "step": 2248 }, { "epoch": 4.4988730277986475, "grad_norm": 0.07898440801629326, "learning_rate": 4.917447416078525e-06, "loss": 0.4794, "num_tokens": 9393008273.0, "step": 2249 }, { "epoch": 4.5008765339343855, "grad_norm": 0.08168348470392132, "learning_rate": 4.910096847439659e-06, "loss": 0.4704, "num_tokens": 9397202577.0, "step": 2250 }, { "epoch": 4.5028800400701225, "grad_norm": 0.07583047643697971, "learning_rate": 4.902775079866236e-06, "loss": 0.4681, "num_tokens": 9401392274.0, "step": 2251 }, { "epoch": 4.5048835462058605, "grad_norm": 0.08201503064200664, "learning_rate": 4.8954821256974e-06, "loss": 0.4724, "num_tokens": 9405586028.0, "step": 2252 }, { "epoch": 4.506887052341598, "grad_norm": 0.08095737892370239, "learning_rate": 4.888217997223731e-06, "loss": 0.4716, "num_tokens": 9409729281.0, "step": 2253 }, { "epoch": 4.5088905584773356, "grad_norm": 0.08151004867790786, "learning_rate": 4.880982706687237e-06, "loss": 0.4594, "num_tokens": 9413923585.0, "step": 2254 }, { "epoch": 4.510894064613073, "grad_norm": 0.08447447712683138, "learning_rate": 4.873776266281313e-06, "loss": 0.4748, "num_tokens": 9418101487.0, "step": 2255 }, { "epoch": 4.512897570748811, "grad_norm": 0.07941577537358513, "learning_rate": 4.8665986881507494e-06, "loss": 0.463, "num_tokens": 9422285352.0, "step": 2256 }, { "epoch": 4.514901076884548, "grad_norm": 0.08113007208066002, "learning_rate": 4.859449984391681e-06, "loss": 0.4698, "num_tokens": 9426479656.0, "step": 2257 }, { "epoch": 4.516904583020286, "grad_norm": 0.08527396801362119, "learning_rate": 4.852330167051595e-06, "loss": 0.4701, "num_tokens": 9430645097.0, "step": 2258 }, { "epoch": 4.518908089156023, "grad_norm": 0.08117126142383972, "learning_rate": 4.845239248129287e-06, "loss": 0.461, "num_tokens": 9434827611.0, "step": 2259 }, { "epoch": 4.520911595291761, "grad_norm": 0.07946517604581484, "learning_rate": 4.838177239574857e-06, "loss": 0.47, "num_tokens": 9439011924.0, "step": 2260 }, { "epoch": 4.522915101427498, "grad_norm": 0.08207859006910842, "learning_rate": 4.8311441532896766e-06, "loss": 0.4626, "num_tokens": 9443206228.0, "step": 2261 }, { "epoch": 4.524918607563236, "grad_norm": 0.08705541806414305, "learning_rate": 4.8241400011263916e-06, "loss": 0.4801, "num_tokens": 9447400532.0, "step": 2262 }, { "epoch": 4.526922113698973, "grad_norm": 0.07776399944760348, "learning_rate": 4.817164794888865e-06, "loss": 0.4798, "num_tokens": 9451576931.0, "step": 2263 }, { "epoch": 4.528925619834711, "grad_norm": 0.08205755976785148, "learning_rate": 4.810218546332193e-06, "loss": 0.4735, "num_tokens": 9455771235.0, "step": 2264 }, { "epoch": 4.530929125970449, "grad_norm": 0.08418094841194479, "learning_rate": 4.80330126716266e-06, "loss": 0.4851, "num_tokens": 9459950060.0, "step": 2265 }, { "epoch": 4.532932632106186, "grad_norm": 0.09042866596891103, "learning_rate": 4.796412969037742e-06, "loss": 0.4598, "num_tokens": 9464121024.0, "step": 2266 }, { "epoch": 4.534936138241923, "grad_norm": 0.08028995606732664, "learning_rate": 4.789553663566062e-06, "loss": 0.4666, "num_tokens": 9468315328.0, "step": 2267 }, { "epoch": 4.536939644377661, "grad_norm": 0.09016608021913203, "learning_rate": 4.782723362307389e-06, "loss": 0.4715, "num_tokens": 9472482588.0, "step": 2268 }, { "epoch": 4.538943150513399, "grad_norm": 0.07907596125962106, "learning_rate": 4.77592207677261e-06, "loss": 0.4657, "num_tokens": 9476659665.0, "step": 2269 }, { "epoch": 4.540946656649136, "grad_norm": 0.07976697554310207, "learning_rate": 4.769149818423712e-06, "loss": 0.48, "num_tokens": 9480853969.0, "step": 2270 }, { "epoch": 4.542950162784874, "grad_norm": 0.08429496070794516, "learning_rate": 4.762406598673766e-06, "loss": 0.4708, "num_tokens": 9485016283.0, "step": 2271 }, { "epoch": 4.544953668920611, "grad_norm": 0.0812787586931539, "learning_rate": 4.755692428886906e-06, "loss": 0.4743, "num_tokens": 9489210587.0, "step": 2272 }, { "epoch": 4.546957175056349, "grad_norm": 0.08996536230301652, "learning_rate": 4.749007320378305e-06, "loss": 0.4679, "num_tokens": 9493394144.0, "step": 2273 }, { "epoch": 4.548960681192086, "grad_norm": 0.08946864455139299, "learning_rate": 4.742351284414163e-06, "loss": 0.4642, "num_tokens": 9497588448.0, "step": 2274 }, { "epoch": 4.550964187327824, "grad_norm": 0.08528904756899573, "learning_rate": 4.7357243322116795e-06, "loss": 0.4767, "num_tokens": 9501774352.0, "step": 2275 }, { "epoch": 4.552967693463561, "grad_norm": 0.08617413692112791, "learning_rate": 4.72912647493905e-06, "loss": 0.4738, "num_tokens": 9505968656.0, "step": 2276 }, { "epoch": 4.554971199599299, "grad_norm": 0.07618935920272712, "learning_rate": 4.722557723715427e-06, "loss": 0.4636, "num_tokens": 9510162960.0, "step": 2277 }, { "epoch": 4.556974705735036, "grad_norm": 0.07762622250787078, "learning_rate": 4.71601808961092e-06, "loss": 0.4607, "num_tokens": 9514330281.0, "step": 2278 }, { "epoch": 4.558978211870774, "grad_norm": 0.0756014796154467, "learning_rate": 4.709507583646559e-06, "loss": 0.4792, "num_tokens": 9518510024.0, "step": 2279 }, { "epoch": 4.560981718006511, "grad_norm": 0.08433306676827722, "learning_rate": 4.703026216794302e-06, "loss": 0.4773, "num_tokens": 9522680903.0, "step": 2280 }, { "epoch": 4.562985224142249, "grad_norm": 0.08718139972809358, "learning_rate": 4.696573999976976e-06, "loss": 0.4828, "num_tokens": 9526875207.0, "step": 2281 }, { "epoch": 4.564988730277986, "grad_norm": 0.08394073440775082, "learning_rate": 4.6901509440683025e-06, "loss": 0.4744, "num_tokens": 9531063885.0, "step": 2282 }, { "epoch": 4.566992236413724, "grad_norm": 0.08601402807545853, "learning_rate": 4.683757059892847e-06, "loss": 0.4694, "num_tokens": 9535244730.0, "step": 2283 }, { "epoch": 4.568995742549461, "grad_norm": 0.0802233830734959, "learning_rate": 4.677392358226025e-06, "loss": 0.473, "num_tokens": 9539418495.0, "step": 2284 }, { "epoch": 4.570999248685199, "grad_norm": 0.0828954682510329, "learning_rate": 4.671056849794052e-06, "loss": 0.472, "num_tokens": 9543582759.0, "step": 2285 }, { "epoch": 4.573002754820937, "grad_norm": 0.0938170164583961, "learning_rate": 4.664750545273967e-06, "loss": 0.4739, "num_tokens": 9547749170.0, "step": 2286 }, { "epoch": 4.575006260956674, "grad_norm": 0.07916568089540423, "learning_rate": 4.6584734552935786e-06, "loss": 0.4721, "num_tokens": 9551943474.0, "step": 2287 }, { "epoch": 4.577009767092411, "grad_norm": 0.07988456535736731, "learning_rate": 4.652225590431468e-06, "loss": 0.4591, "num_tokens": 9556136488.0, "step": 2288 }, { "epoch": 4.579013273228149, "grad_norm": 0.08861242129119136, "learning_rate": 4.646006961216959e-06, "loss": 0.4634, "num_tokens": 9560330792.0, "step": 2289 }, { "epoch": 4.581016779363887, "grad_norm": 0.08361013461745702, "learning_rate": 4.639817578130109e-06, "loss": 0.4614, "num_tokens": 9564504930.0, "step": 2290 }, { "epoch": 4.583020285499624, "grad_norm": 0.08097301192021572, "learning_rate": 4.633657451601683e-06, "loss": 0.4788, "num_tokens": 9568692383.0, "step": 2291 }, { "epoch": 4.585023791635362, "grad_norm": 0.083083973727066, "learning_rate": 4.62752659201315e-06, "loss": 0.4672, "num_tokens": 9572873927.0, "step": 2292 }, { "epoch": 4.587027297771099, "grad_norm": 0.0843715578347384, "learning_rate": 4.621425009696648e-06, "loss": 0.4868, "num_tokens": 9577065826.0, "step": 2293 }, { "epoch": 4.589030803906837, "grad_norm": 0.08233846069088553, "learning_rate": 4.615352714934978e-06, "loss": 0.4637, "num_tokens": 9581226907.0, "step": 2294 }, { "epoch": 4.591034310042574, "grad_norm": 0.08776643659452908, "learning_rate": 4.6093097179615845e-06, "loss": 0.4766, "num_tokens": 9585413964.0, "step": 2295 }, { "epoch": 4.593037816178312, "grad_norm": 0.07590392384337867, "learning_rate": 4.6032960289605345e-06, "loss": 0.4618, "num_tokens": 9589608268.0, "step": 2296 }, { "epoch": 4.595041322314049, "grad_norm": 0.08413674839588552, "learning_rate": 4.597311658066509e-06, "loss": 0.4697, "num_tokens": 9593802572.0, "step": 2297 }, { "epoch": 4.597044828449787, "grad_norm": 0.08682100652949869, "learning_rate": 4.5913566153647735e-06, "loss": 0.4672, "num_tokens": 9597962614.0, "step": 2298 }, { "epoch": 4.599048334585524, "grad_norm": 0.07836024801330817, "learning_rate": 4.585430910891169e-06, "loss": 0.4747, "num_tokens": 9602153761.0, "step": 2299 }, { "epoch": 4.601051840721262, "grad_norm": 0.08018583050092715, "learning_rate": 4.5795345546320955e-06, "loss": 0.4822, "num_tokens": 9606348065.0, "step": 2300 }, { "epoch": 4.6030553468569995, "grad_norm": 0.08152564893963704, "learning_rate": 4.5736675565244985e-06, "loss": 0.4688, "num_tokens": 9610512605.0, "step": 2301 }, { "epoch": 4.605058852992737, "grad_norm": 0.08395657943233324, "learning_rate": 4.567829926455834e-06, "loss": 0.4626, "num_tokens": 9614663302.0, "step": 2302 }, { "epoch": 4.6070623591284745, "grad_norm": 0.08047654344562606, "learning_rate": 4.562021674264079e-06, "loss": 0.4797, "num_tokens": 9618827196.0, "step": 2303 }, { "epoch": 4.6090658652642125, "grad_norm": 0.08886761662496974, "learning_rate": 4.55624280973769e-06, "loss": 0.4789, "num_tokens": 9623021500.0, "step": 2304 }, { "epoch": 4.6110693713999495, "grad_norm": 0.27476898563524016, "learning_rate": 4.550493342615604e-06, "loss": 0.4665, "num_tokens": 9627205173.0, "step": 2305 }, { "epoch": 4.6130728775356875, "grad_norm": 0.09008778500960532, "learning_rate": 4.544773282587213e-06, "loss": 0.4748, "num_tokens": 9631384549.0, "step": 2306 }, { "epoch": 4.6150763836714255, "grad_norm": 0.08925879073869225, "learning_rate": 4.53908263929235e-06, "loss": 0.4672, "num_tokens": 9635533557.0, "step": 2307 }, { "epoch": 4.6170798898071626, "grad_norm": 0.08691573964708726, "learning_rate": 4.533421422321275e-06, "loss": 0.4811, "num_tokens": 9639727861.0, "step": 2308 }, { "epoch": 4.6190833959429, "grad_norm": 0.07907076466858211, "learning_rate": 4.527789641214655e-06, "loss": 0.4594, "num_tokens": 9643913593.0, "step": 2309 }, { "epoch": 4.621086902078638, "grad_norm": 0.07635292984312796, "learning_rate": 4.52218730546355e-06, "loss": 0.4674, "num_tokens": 9648070605.0, "step": 2310 }, { "epoch": 4.623090408214376, "grad_norm": 0.07971453217419544, "learning_rate": 4.516614424509397e-06, "loss": 0.4762, "num_tokens": 9652256007.0, "step": 2311 }, { "epoch": 4.625093914350113, "grad_norm": 0.08861629120279098, "learning_rate": 4.511071007743992e-06, "loss": 0.4734, "num_tokens": 9656444462.0, "step": 2312 }, { "epoch": 4.627097420485851, "grad_norm": 0.07929965132367893, "learning_rate": 4.505557064509479e-06, "loss": 0.4647, "num_tokens": 9660604764.0, "step": 2313 }, { "epoch": 4.629100926621588, "grad_norm": 0.0794129948400999, "learning_rate": 4.500072604098324e-06, "loss": 0.48, "num_tokens": 9664799068.0, "step": 2314 }, { "epoch": 4.631104432757326, "grad_norm": 0.08819450886415212, "learning_rate": 4.494617635753321e-06, "loss": 0.4909, "num_tokens": 9668993372.0, "step": 2315 }, { "epoch": 4.633107938893063, "grad_norm": 0.08340677766896547, "learning_rate": 4.48919216866755e-06, "loss": 0.4595, "num_tokens": 9673182531.0, "step": 2316 }, { "epoch": 4.635111445028801, "grad_norm": 0.08393315888780188, "learning_rate": 4.4837962119843735e-06, "loss": 0.4836, "num_tokens": 9677348782.0, "step": 2317 }, { "epoch": 4.637114951164538, "grad_norm": 0.07643839581853494, "learning_rate": 4.478429774797427e-06, "loss": 0.4614, "num_tokens": 9681523084.0, "step": 2318 }, { "epoch": 4.639118457300276, "grad_norm": 0.07737583651672673, "learning_rate": 4.4730928661505954e-06, "loss": 0.4686, "num_tokens": 9685717388.0, "step": 2319 }, { "epoch": 4.641121963436013, "grad_norm": 0.08187926942568731, "learning_rate": 4.467785495037998e-06, "loss": 0.4726, "num_tokens": 9689905276.0, "step": 2320 }, { "epoch": 4.643125469571751, "grad_norm": 0.0845936121230918, "learning_rate": 4.462507670403979e-06, "loss": 0.4744, "num_tokens": 9694099580.0, "step": 2321 }, { "epoch": 4.645128975707488, "grad_norm": 0.08549289802259942, "learning_rate": 4.457259401143083e-06, "loss": 0.4789, "num_tokens": 9698293884.0, "step": 2322 }, { "epoch": 4.647132481843226, "grad_norm": 0.08092692805637505, "learning_rate": 4.452040696100052e-06, "loss": 0.4696, "num_tokens": 9702488188.0, "step": 2323 }, { "epoch": 4.649135987978963, "grad_norm": 0.08019311411372011, "learning_rate": 4.446851564069802e-06, "loss": 0.4773, "num_tokens": 9706667220.0, "step": 2324 }, { "epoch": 4.651139494114701, "grad_norm": 0.08865913383159628, "learning_rate": 4.441692013797414e-06, "loss": 0.4773, "num_tokens": 9710856054.0, "step": 2325 }, { "epoch": 4.653143000250438, "grad_norm": 0.08280464593888466, "learning_rate": 4.4365620539781065e-06, "loss": 0.4777, "num_tokens": 9715050358.0, "step": 2326 }, { "epoch": 4.655146506386176, "grad_norm": 0.08170210498435543, "learning_rate": 4.431461693257242e-06, "loss": 0.4773, "num_tokens": 9719244662.0, "step": 2327 }, { "epoch": 4.657150012521913, "grad_norm": 0.08294911308795994, "learning_rate": 4.426390940230292e-06, "loss": 0.4623, "num_tokens": 9723438966.0, "step": 2328 }, { "epoch": 4.659153518657651, "grad_norm": 0.08634439689260537, "learning_rate": 4.421349803442833e-06, "loss": 0.4641, "num_tokens": 9727620465.0, "step": 2329 }, { "epoch": 4.661157024793388, "grad_norm": 0.07904530308728323, "learning_rate": 4.416338291390533e-06, "loss": 0.4843, "num_tokens": 9731791799.0, "step": 2330 }, { "epoch": 4.663160530929126, "grad_norm": 0.07758984844521694, "learning_rate": 4.411356412519131e-06, "loss": 0.4683, "num_tokens": 9735960463.0, "step": 2331 }, { "epoch": 4.665164037064864, "grad_norm": 0.08189288958979543, "learning_rate": 4.406404175224425e-06, "loss": 0.4609, "num_tokens": 9740144191.0, "step": 2332 }, { "epoch": 4.667167543200601, "grad_norm": 0.07687823874965072, "learning_rate": 4.401481587852269e-06, "loss": 0.4657, "num_tokens": 9744338495.0, "step": 2333 }, { "epoch": 4.669171049336339, "grad_norm": 0.08371855647178682, "learning_rate": 4.396588658698535e-06, "loss": 0.4734, "num_tokens": 9748532799.0, "step": 2334 }, { "epoch": 4.671174555472076, "grad_norm": 0.07643700594283652, "learning_rate": 4.391725396009124e-06, "loss": 0.4684, "num_tokens": 9752727103.0, "step": 2335 }, { "epoch": 4.673178061607814, "grad_norm": 0.07855194836415223, "learning_rate": 4.386891807979933e-06, "loss": 0.4788, "num_tokens": 9756914682.0, "step": 2336 }, { "epoch": 4.675181567743551, "grad_norm": 0.08106961732185751, "learning_rate": 4.3820879027568564e-06, "loss": 0.4555, "num_tokens": 9761108986.0, "step": 2337 }, { "epoch": 4.677185073879289, "grad_norm": 0.07879690815755974, "learning_rate": 4.377313688435757e-06, "loss": 0.4833, "num_tokens": 9765303290.0, "step": 2338 }, { "epoch": 4.679188580015026, "grad_norm": 0.08440706414670428, "learning_rate": 4.372569173062469e-06, "loss": 0.4783, "num_tokens": 9769477518.0, "step": 2339 }, { "epoch": 4.681192086150764, "grad_norm": 0.07944374957977621, "learning_rate": 4.367854364632768e-06, "loss": 0.4695, "num_tokens": 9773652049.0, "step": 2340 }, { "epoch": 4.683195592286501, "grad_norm": 0.07531939403217029, "learning_rate": 4.3631692710923715e-06, "loss": 0.4682, "num_tokens": 9777831044.0, "step": 2341 }, { "epoch": 4.685199098422239, "grad_norm": 0.07782345268581248, "learning_rate": 4.358513900336919e-06, "loss": 0.4656, "num_tokens": 9782025348.0, "step": 2342 }, { "epoch": 4.687202604557976, "grad_norm": 0.07809597538768145, "learning_rate": 4.3538882602119566e-06, "loss": 0.4565, "num_tokens": 9786219652.0, "step": 2343 }, { "epoch": 4.689206110693714, "grad_norm": 0.08454586436849781, "learning_rate": 4.3492923585129255e-06, "loss": 0.4792, "num_tokens": 9790413956.0, "step": 2344 }, { "epoch": 4.691209616829451, "grad_norm": 0.08835642003460409, "learning_rate": 4.3447262029851544e-06, "loss": 0.4587, "num_tokens": 9794608260.0, "step": 2345 }, { "epoch": 4.693213122965189, "grad_norm": 0.07531099511838947, "learning_rate": 4.3401898013238335e-06, "loss": 0.4618, "num_tokens": 9798802564.0, "step": 2346 }, { "epoch": 4.695216629100926, "grad_norm": 0.07717905886921189, "learning_rate": 4.3356831611740204e-06, "loss": 0.4813, "num_tokens": 9802996868.0, "step": 2347 }, { "epoch": 4.697220135236664, "grad_norm": 0.08153413251560769, "learning_rate": 4.331206290130608e-06, "loss": 0.4683, "num_tokens": 9807191172.0, "step": 2348 }, { "epoch": 4.699223641372401, "grad_norm": 0.09019387865150598, "learning_rate": 4.326759195738324e-06, "loss": 0.4724, "num_tokens": 9811385476.0, "step": 2349 }, { "epoch": 4.701227147508139, "grad_norm": 0.08020757105766407, "learning_rate": 4.322341885491715e-06, "loss": 0.4689, "num_tokens": 9815579780.0, "step": 2350 }, { "epoch": 4.703230653643876, "grad_norm": 0.08012724307534566, "learning_rate": 4.317954366835133e-06, "loss": 0.4678, "num_tokens": 9819774084.0, "step": 2351 }, { "epoch": 4.705234159779614, "grad_norm": 0.07604799034241251, "learning_rate": 4.313596647162719e-06, "loss": 0.4785, "num_tokens": 9823949717.0, "step": 2352 }, { "epoch": 4.707237665915352, "grad_norm": 0.08701165387040632, "learning_rate": 4.309268733818401e-06, "loss": 0.4728, "num_tokens": 9828144021.0, "step": 2353 }, { "epoch": 4.709241172051089, "grad_norm": 0.07801149909191785, "learning_rate": 4.30497063409587e-06, "loss": 0.4714, "num_tokens": 9832313922.0, "step": 2354 }, { "epoch": 4.7112446781868265, "grad_norm": 0.08489387292921831, "learning_rate": 4.300702355238578e-06, "loss": 0.4602, "num_tokens": 9836484735.0, "step": 2355 }, { "epoch": 4.713248184322564, "grad_norm": 0.07472846374986904, "learning_rate": 4.2964639044397175e-06, "loss": 0.4618, "num_tokens": 9840667716.0, "step": 2356 }, { "epoch": 4.715251690458302, "grad_norm": 0.07897756902816624, "learning_rate": 4.292255288842213e-06, "loss": 0.4724, "num_tokens": 9844862020.0, "step": 2357 }, { "epoch": 4.7172551965940395, "grad_norm": 0.07610409440887012, "learning_rate": 4.288076515538711e-06, "loss": 0.4634, "num_tokens": 9849025492.0, "step": 2358 }, { "epoch": 4.719258702729777, "grad_norm": 0.08001859187187106, "learning_rate": 4.283927591571563e-06, "loss": 0.4716, "num_tokens": 9853219796.0, "step": 2359 }, { "epoch": 4.7212622088655145, "grad_norm": 0.08018861952243106, "learning_rate": 4.279808523932817e-06, "loss": 0.4481, "num_tokens": 9857373430.0, "step": 2360 }, { "epoch": 4.7232657150012525, "grad_norm": 0.0790362492641528, "learning_rate": 4.275719319564207e-06, "loss": 0.4887, "num_tokens": 9861567734.0, "step": 2361 }, { "epoch": 4.7252692211369896, "grad_norm": 0.08487776642347875, "learning_rate": 4.271659985357136e-06, "loss": 0.4644, "num_tokens": 9865732581.0, "step": 2362 }, { "epoch": 4.7272727272727275, "grad_norm": 0.07996621379782065, "learning_rate": 4.267630528152671e-06, "loss": 0.4615, "num_tokens": 9869905034.0, "step": 2363 }, { "epoch": 4.729276233408465, "grad_norm": 0.07752535286648754, "learning_rate": 4.263630954741527e-06, "loss": 0.4817, "num_tokens": 9874099338.0, "step": 2364 }, { "epoch": 4.731279739544203, "grad_norm": 0.07970177104272191, "learning_rate": 4.259661271864055e-06, "loss": 0.4633, "num_tokens": 9878293642.0, "step": 2365 }, { "epoch": 4.73328324567994, "grad_norm": 0.08684493212316392, "learning_rate": 4.255721486210238e-06, "loss": 0.4752, "num_tokens": 9882474871.0, "step": 2366 }, { "epoch": 4.735286751815678, "grad_norm": 0.08561797747046042, "learning_rate": 4.251811604419665e-06, "loss": 0.4618, "num_tokens": 9886669175.0, "step": 2367 }, { "epoch": 4.737290257951415, "grad_norm": 0.07705412321944749, "learning_rate": 4.247931633081538e-06, "loss": 0.4768, "num_tokens": 9890863479.0, "step": 2368 }, { "epoch": 4.739293764087153, "grad_norm": 0.09552944102369168, "learning_rate": 4.244081578734648e-06, "loss": 0.4699, "num_tokens": 9895010090.0, "step": 2369 }, { "epoch": 4.74129727022289, "grad_norm": 0.09433237329670321, "learning_rate": 4.240261447867368e-06, "loss": 0.4712, "num_tokens": 9899204394.0, "step": 2370 }, { "epoch": 4.743300776358628, "grad_norm": 0.08111506768938555, "learning_rate": 4.23647124691764e-06, "loss": 0.4693, "num_tokens": 9903398698.0, "step": 2371 }, { "epoch": 4.745304282494365, "grad_norm": 0.07578720099473646, "learning_rate": 4.232710982272971e-06, "loss": 0.4639, "num_tokens": 9907593002.0, "step": 2372 }, { "epoch": 4.747307788630103, "grad_norm": 0.09005545398112803, "learning_rate": 4.22898066027041e-06, "loss": 0.4752, "num_tokens": 9911771948.0, "step": 2373 }, { "epoch": 4.749311294765841, "grad_norm": 0.09118761577090577, "learning_rate": 4.225280287196556e-06, "loss": 0.4598, "num_tokens": 9915948347.0, "step": 2374 }, { "epoch": 4.751314800901578, "grad_norm": 0.07509042258630633, "learning_rate": 4.221609869287523e-06, "loss": 0.4739, "num_tokens": 9920117342.0, "step": 2375 }, { "epoch": 4.753318307037315, "grad_norm": 0.07559256075103349, "learning_rate": 4.217969412728948e-06, "loss": 0.4562, "num_tokens": 9924311646.0, "step": 2376 }, { "epoch": 4.755321813173053, "grad_norm": 0.07710888588424732, "learning_rate": 4.2143589236559804e-06, "loss": 0.4579, "num_tokens": 9928505950.0, "step": 2377 }, { "epoch": 4.757325319308791, "grad_norm": 0.0802650917639491, "learning_rate": 4.21077840815326e-06, "loss": 0.4662, "num_tokens": 9932674340.0, "step": 2378 }, { "epoch": 4.759328825444528, "grad_norm": 0.09392649736216935, "learning_rate": 4.207227872254911e-06, "loss": 0.4775, "num_tokens": 9936848963.0, "step": 2379 }, { "epoch": 4.761332331580266, "grad_norm": 0.08093299757373923, "learning_rate": 4.203707321944543e-06, "loss": 0.4735, "num_tokens": 9941043267.0, "step": 2380 }, { "epoch": 4.763335837716003, "grad_norm": 0.074938710234304, "learning_rate": 4.200216763155223e-06, "loss": 0.4743, "num_tokens": 9945219439.0, "step": 2381 }, { "epoch": 4.765339343851741, "grad_norm": 0.09209683578923246, "learning_rate": 4.19675620176948e-06, "loss": 0.4696, "num_tokens": 9949413743.0, "step": 2382 }, { "epoch": 4.767342849987478, "grad_norm": 0.07868623869777042, "learning_rate": 4.193325643619284e-06, "loss": 0.4637, "num_tokens": 9953549469.0, "step": 2383 }, { "epoch": 4.769346356123216, "grad_norm": 0.08026610506211959, "learning_rate": 4.189925094486049e-06, "loss": 0.4767, "num_tokens": 9957743773.0, "step": 2384 }, { "epoch": 4.771349862258953, "grad_norm": 0.08647274751322725, "learning_rate": 4.186554560100605e-06, "loss": 0.4679, "num_tokens": 9961912886.0, "step": 2385 }, { "epoch": 4.773353368394691, "grad_norm": 0.08069480988655821, "learning_rate": 4.1832140461432125e-06, "loss": 0.478, "num_tokens": 9966107190.0, "step": 2386 }, { "epoch": 4.775356874530428, "grad_norm": 0.08044575731541408, "learning_rate": 4.179903558243526e-06, "loss": 0.4624, "num_tokens": 9970278809.0, "step": 2387 }, { "epoch": 4.777360380666166, "grad_norm": 0.07774785557753178, "learning_rate": 4.176623101980611e-06, "loss": 0.4721, "num_tokens": 9974448626.0, "step": 2388 }, { "epoch": 4.779363886801903, "grad_norm": 0.07863614222210205, "learning_rate": 4.173372682882909e-06, "loss": 0.4552, "num_tokens": 9978632803.0, "step": 2389 }, { "epoch": 4.781367392937641, "grad_norm": 0.07781362162853066, "learning_rate": 4.170152306428252e-06, "loss": 0.4791, "num_tokens": 9982827107.0, "step": 2390 }, { "epoch": 4.783370899073378, "grad_norm": 0.08055438178954676, "learning_rate": 4.166961978043834e-06, "loss": 0.4844, "num_tokens": 9986963595.0, "step": 2391 }, { "epoch": 4.785374405209116, "grad_norm": 0.08347212150999164, "learning_rate": 4.163801703106214e-06, "loss": 0.4757, "num_tokens": 9991138673.0, "step": 2392 }, { "epoch": 4.787377911344853, "grad_norm": 0.0787033915251486, "learning_rate": 4.160671486941304e-06, "loss": 0.4609, "num_tokens": 9995332977.0, "step": 2393 }, { "epoch": 4.789381417480591, "grad_norm": 0.0804046253653497, "learning_rate": 4.157571334824353e-06, "loss": 0.4747, "num_tokens": 9999512559.0, "step": 2394 }, { "epoch": 4.791384923616329, "grad_norm": 0.07979999854338546, "learning_rate": 4.1545012519799495e-06, "loss": 0.4721, "num_tokens": 10003701790.0, "step": 2395 }, { "epoch": 4.793388429752066, "grad_norm": 0.07737864806319746, "learning_rate": 4.151461243582007e-06, "loss": 0.4741, "num_tokens": 10007896094.0, "step": 2396 }, { "epoch": 4.795391935887803, "grad_norm": 0.08816407810586308, "learning_rate": 4.148451314753752e-06, "loss": 0.4753, "num_tokens": 10012067594.0, "step": 2397 }, { "epoch": 4.797395442023541, "grad_norm": 0.08377205868477101, "learning_rate": 4.145471470567724e-06, "loss": 0.4711, "num_tokens": 10016261898.0, "step": 2398 }, { "epoch": 4.799398948159279, "grad_norm": 0.08927337643133353, "learning_rate": 4.142521716045754e-06, "loss": 0.4665, "num_tokens": 10020437359.0, "step": 2399 }, { "epoch": 4.801402454295016, "grad_norm": 0.07936916957279566, "learning_rate": 4.139602056158975e-06, "loss": 0.4744, "num_tokens": 10024631663.0, "step": 2400 }, { "epoch": 4.803405960430754, "grad_norm": 0.08272083631666714, "learning_rate": 4.136712495827789e-06, "loss": 0.4653, "num_tokens": 10028804631.0, "step": 2401 }, { "epoch": 4.805409466566491, "grad_norm": 0.08609555252337277, "learning_rate": 4.133853039921884e-06, "loss": 0.4764, "num_tokens": 10032998935.0, "step": 2402 }, { "epoch": 4.807412972702229, "grad_norm": 0.07635367418981714, "learning_rate": 4.131023693260207e-06, "loss": 0.468, "num_tokens": 10037158969.0, "step": 2403 }, { "epoch": 4.809416478837966, "grad_norm": 0.08491019283466879, "learning_rate": 4.12822446061097e-06, "loss": 0.4728, "num_tokens": 10041344972.0, "step": 2404 }, { "epoch": 4.811419984973704, "grad_norm": 0.08083407367575697, "learning_rate": 4.125455346691625e-06, "loss": 0.4604, "num_tokens": 10045539276.0, "step": 2405 }, { "epoch": 4.813423491109441, "grad_norm": 0.08057963874855378, "learning_rate": 4.12271635616887e-06, "loss": 0.4767, "num_tokens": 10049702390.0, "step": 2406 }, { "epoch": 4.815426997245179, "grad_norm": 0.08138676829777723, "learning_rate": 4.120007493658641e-06, "loss": 0.4667, "num_tokens": 10053872005.0, "step": 2407 }, { "epoch": 4.817430503380916, "grad_norm": 0.0849379976271702, "learning_rate": 4.117328763726097e-06, "loss": 0.4827, "num_tokens": 10058049440.0, "step": 2408 }, { "epoch": 4.819434009516654, "grad_norm": 0.0826263064231753, "learning_rate": 4.114680170885614e-06, "loss": 0.4732, "num_tokens": 10062243744.0, "step": 2409 }, { "epoch": 4.821437515652391, "grad_norm": 0.076745310119594, "learning_rate": 4.112061719600783e-06, "loss": 0.479, "num_tokens": 10066421097.0, "step": 2410 }, { "epoch": 4.823441021788129, "grad_norm": 0.07546297138512073, "learning_rate": 4.109473414284392e-06, "loss": 0.4734, "num_tokens": 10070615401.0, "step": 2411 }, { "epoch": 4.8254445279238665, "grad_norm": 0.08315705018275754, "learning_rate": 4.10691525929843e-06, "loss": 0.4704, "num_tokens": 10074809705.0, "step": 2412 }, { "epoch": 4.827448034059604, "grad_norm": 0.07983377743226303, "learning_rate": 4.104387258954077e-06, "loss": 0.4894, "num_tokens": 10078977118.0, "step": 2413 }, { "epoch": 4.8294515401953415, "grad_norm": 0.0787318678138568, "learning_rate": 4.101889417511687e-06, "loss": 0.4663, "num_tokens": 10083151631.0, "step": 2414 }, { "epoch": 4.8314550463310795, "grad_norm": 0.0787624012497149, "learning_rate": 4.0994217391807935e-06, "loss": 0.4627, "num_tokens": 10087345935.0, "step": 2415 }, { "epoch": 4.8334585524668165, "grad_norm": 0.08018770738087942, "learning_rate": 4.0969842281200964e-06, "loss": 0.4737, "num_tokens": 10091493546.0, "step": 2416 }, { "epoch": 4.8354620586025545, "grad_norm": 0.07967600860184142, "learning_rate": 4.094576888437453e-06, "loss": 0.4682, "num_tokens": 10095672422.0, "step": 2417 }, { "epoch": 4.837465564738292, "grad_norm": 0.08876412988000522, "learning_rate": 4.092199724189878e-06, "loss": 0.4572, "num_tokens": 10099866726.0, "step": 2418 }, { "epoch": 4.8394690708740296, "grad_norm": 0.08611452514277908, "learning_rate": 4.089852739383527e-06, "loss": 0.4705, "num_tokens": 10104061030.0, "step": 2419 }, { "epoch": 4.8414725770097675, "grad_norm": 0.08078576101363474, "learning_rate": 4.087535937973704e-06, "loss": 0.4692, "num_tokens": 10108231741.0, "step": 2420 }, { "epoch": 4.843476083145505, "grad_norm": 0.08210619124552526, "learning_rate": 4.085249323864831e-06, "loss": 0.4919, "num_tokens": 10112415266.0, "step": 2421 }, { "epoch": 4.845479589281243, "grad_norm": 0.08709446779395594, "learning_rate": 4.082992900910472e-06, "loss": 0.4831, "num_tokens": 10116609570.0, "step": 2422 }, { "epoch": 4.84748309541698, "grad_norm": 0.08264068013292836, "learning_rate": 4.080766672913304e-06, "loss": 0.4619, "num_tokens": 10120803874.0, "step": 2423 }, { "epoch": 4.849486601552718, "grad_norm": 0.07896917455881926, "learning_rate": 4.078570643625115e-06, "loss": 0.4634, "num_tokens": 10124998178.0, "step": 2424 }, { "epoch": 4.851490107688455, "grad_norm": 0.07724630351497967, "learning_rate": 4.076404816746805e-06, "loss": 0.4824, "num_tokens": 10129178092.0, "step": 2425 }, { "epoch": 4.853493613824193, "grad_norm": 0.08485204174370385, "learning_rate": 4.07426919592837e-06, "loss": 0.4653, "num_tokens": 10133355265.0, "step": 2426 }, { "epoch": 4.85549711995993, "grad_norm": 0.1046563093563074, "learning_rate": 4.072163784768906e-06, "loss": 0.4731, "num_tokens": 10137549569.0, "step": 2427 }, { "epoch": 4.857500626095668, "grad_norm": 0.08596780723573927, "learning_rate": 4.0700885868165915e-06, "loss": 0.475, "num_tokens": 10141731495.0, "step": 2428 }, { "epoch": 4.859504132231405, "grad_norm": 0.08223952494904564, "learning_rate": 4.068043605568696e-06, "loss": 0.4707, "num_tokens": 10145921408.0, "step": 2429 }, { "epoch": 4.861507638367143, "grad_norm": 0.07962309433095992, "learning_rate": 4.0660288444715565e-06, "loss": 0.466, "num_tokens": 10150115712.0, "step": 2430 }, { "epoch": 4.86351114450288, "grad_norm": 0.09285155500532516, "learning_rate": 4.064044306920585e-06, "loss": 0.4656, "num_tokens": 10154266666.0, "step": 2431 }, { "epoch": 4.865514650638618, "grad_norm": 0.09944288417746402, "learning_rate": 4.062089996260264e-06, "loss": 0.4539, "num_tokens": 10158460970.0, "step": 2432 }, { "epoch": 4.867518156774355, "grad_norm": 0.07815887574137524, "learning_rate": 4.060165915784124e-06, "loss": 0.4861, "num_tokens": 10162636699.0, "step": 2433 }, { "epoch": 4.869521662910093, "grad_norm": 0.0839526827063354, "learning_rate": 4.05827206873476e-06, "loss": 0.4788, "num_tokens": 10166824887.0, "step": 2434 }, { "epoch": 4.87152516904583, "grad_norm": 0.08942394907122662, "learning_rate": 4.056408458303812e-06, "loss": 0.4762, "num_tokens": 10171019191.0, "step": 2435 }, { "epoch": 4.873528675181568, "grad_norm": 0.08052410207370392, "learning_rate": 4.05457508763196e-06, "loss": 0.4742, "num_tokens": 10175213495.0, "step": 2436 }, { "epoch": 4.875532181317305, "grad_norm": 0.08571503321335017, "learning_rate": 4.05277195980893e-06, "loss": 0.4727, "num_tokens": 10179406967.0, "step": 2437 }, { "epoch": 4.877535687453043, "grad_norm": 0.0816524336082229, "learning_rate": 4.050999077873468e-06, "loss": 0.4738, "num_tokens": 10183598203.0, "step": 2438 }, { "epoch": 4.87953919358878, "grad_norm": 0.08145048971766346, "learning_rate": 4.04925644481336e-06, "loss": 0.4587, "num_tokens": 10187766609.0, "step": 2439 }, { "epoch": 4.881542699724518, "grad_norm": 0.08484522393625951, "learning_rate": 4.047544063565411e-06, "loss": 0.4733, "num_tokens": 10191896655.0, "step": 2440 }, { "epoch": 4.883546205860256, "grad_norm": 0.08220706989448018, "learning_rate": 4.045861937015437e-06, "loss": 0.4712, "num_tokens": 10196090959.0, "step": 2441 }, { "epoch": 4.885549711995993, "grad_norm": 0.07976054330510622, "learning_rate": 4.044210067998275e-06, "loss": 0.4861, "num_tokens": 10200285263.0, "step": 2442 }, { "epoch": 4.88755321813173, "grad_norm": 0.07948573409971085, "learning_rate": 4.042588459297768e-06, "loss": 0.4638, "num_tokens": 10204454319.0, "step": 2443 }, { "epoch": 4.889556724267468, "grad_norm": 0.08169102210734121, "learning_rate": 4.040997113646759e-06, "loss": 0.4584, "num_tokens": 10208635104.0, "step": 2444 }, { "epoch": 4.891560230403206, "grad_norm": 0.0840775379036063, "learning_rate": 4.0394360337270935e-06, "loss": 0.4689, "num_tokens": 10212794674.0, "step": 2445 }, { "epoch": 4.893563736538943, "grad_norm": 0.07741464417871936, "learning_rate": 4.037905222169606e-06, "loss": 0.4717, "num_tokens": 10216945941.0, "step": 2446 }, { "epoch": 4.895567242674681, "grad_norm": 0.0875686547727603, "learning_rate": 4.036404681554128e-06, "loss": 0.4793, "num_tokens": 10221140245.0, "step": 2447 }, { "epoch": 4.897570748810418, "grad_norm": 0.07684656359669303, "learning_rate": 4.034934414409469e-06, "loss": 0.4639, "num_tokens": 10225303515.0, "step": 2448 }, { "epoch": 4.899574254946156, "grad_norm": 0.07729604189979324, "learning_rate": 4.033494423213428e-06, "loss": 0.4753, "num_tokens": 10229497819.0, "step": 2449 }, { "epoch": 4.901577761081893, "grad_norm": 0.07930307696464209, "learning_rate": 4.032084710392771e-06, "loss": 0.4641, "num_tokens": 10233692123.0, "step": 2450 }, { "epoch": 4.903581267217631, "grad_norm": 0.0816732471530333, "learning_rate": 4.030705278323242e-06, "loss": 0.4732, "num_tokens": 10237867333.0, "step": 2451 }, { "epoch": 4.905584773353368, "grad_norm": 0.07561493310617351, "learning_rate": 4.029356129329559e-06, "loss": 0.4594, "num_tokens": 10242061637.0, "step": 2452 }, { "epoch": 4.907588279489106, "grad_norm": 0.07339155725562935, "learning_rate": 4.028037265685394e-06, "loss": 0.4722, "num_tokens": 10246201847.0, "step": 2453 }, { "epoch": 4.909591785624843, "grad_norm": 0.07648691867276455, "learning_rate": 4.026748689613386e-06, "loss": 0.4687, "num_tokens": 10250396151.0, "step": 2454 }, { "epoch": 4.911595291760581, "grad_norm": 0.08098314693499724, "learning_rate": 4.025490403285135e-06, "loss": 0.4838, "num_tokens": 10254590455.0, "step": 2455 }, { "epoch": 4.913598797896318, "grad_norm": 0.08991209326765755, "learning_rate": 4.024262408821186e-06, "loss": 0.4567, "num_tokens": 10258784759.0, "step": 2456 }, { "epoch": 4.915602304032056, "grad_norm": 0.08088684737021298, "learning_rate": 4.0230647082910405e-06, "loss": 0.4555, "num_tokens": 10262978865.0, "step": 2457 }, { "epoch": 4.917605810167793, "grad_norm": 0.08032827378516078, "learning_rate": 4.021897303713146e-06, "loss": 0.4789, "num_tokens": 10267173169.0, "step": 2458 }, { "epoch": 4.919609316303531, "grad_norm": 0.07665394952227446, "learning_rate": 4.0207601970548895e-06, "loss": 0.4839, "num_tokens": 10271354454.0, "step": 2459 }, { "epoch": 4.921612822439268, "grad_norm": 0.09199692277743979, "learning_rate": 4.0196533902326015e-06, "loss": 0.4769, "num_tokens": 10275528868.0, "step": 2460 }, { "epoch": 4.923616328575006, "grad_norm": 0.07747131561501497, "learning_rate": 4.018576885111549e-06, "loss": 0.4776, "num_tokens": 10279723172.0, "step": 2461 }, { "epoch": 4.925619834710744, "grad_norm": 0.07999612394000676, "learning_rate": 4.017530683505932e-06, "loss": 0.4764, "num_tokens": 10283898079.0, "step": 2462 }, { "epoch": 4.927623340846481, "grad_norm": 0.08278085197877944, "learning_rate": 4.016514787178878e-06, "loss": 0.4665, "num_tokens": 10288087031.0, "step": 2463 }, { "epoch": 4.929626846982218, "grad_norm": 0.08539964262447788, "learning_rate": 4.0155291978424436e-06, "loss": 0.4758, "num_tokens": 10292263262.0, "step": 2464 }, { "epoch": 4.931630353117956, "grad_norm": 0.08188614664305172, "learning_rate": 4.0145739171576165e-06, "loss": 0.4665, "num_tokens": 10296432755.0, "step": 2465 }, { "epoch": 4.933633859253694, "grad_norm": 0.08539847198969494, "learning_rate": 4.013648946734296e-06, "loss": 0.4693, "num_tokens": 10300627059.0, "step": 2466 }, { "epoch": 4.935637365389431, "grad_norm": 0.0798027657644911, "learning_rate": 4.012754288131304e-06, "loss": 0.4782, "num_tokens": 10304772864.0, "step": 2467 }, { "epoch": 4.937640871525169, "grad_norm": 0.08014821816395609, "learning_rate": 4.011889942856385e-06, "loss": 0.4671, "num_tokens": 10308967168.0, "step": 2468 }, { "epoch": 4.9396443776609065, "grad_norm": 0.08292028879964522, "learning_rate": 4.011055912366187e-06, "loss": 0.4841, "num_tokens": 10313136859.0, "step": 2469 }, { "epoch": 4.941647883796644, "grad_norm": 0.08295573827135565, "learning_rate": 4.010252198066279e-06, "loss": 0.467, "num_tokens": 10317331163.0, "step": 2470 }, { "epoch": 4.9436513899323815, "grad_norm": 0.07899769643955437, "learning_rate": 4.009478801311132e-06, "loss": 0.4793, "num_tokens": 10321511628.0, "step": 2471 }, { "epoch": 4.9456548960681195, "grad_norm": 0.08001139890044474, "learning_rate": 4.008735723404128e-06, "loss": 0.4652, "num_tokens": 10325658261.0, "step": 2472 }, { "epoch": 4.9476584022038566, "grad_norm": 0.0838964656592363, "learning_rate": 4.008022965597554e-06, "loss": 0.4847, "num_tokens": 10329848889.0, "step": 2473 }, { "epoch": 4.9496619083395945, "grad_norm": 0.09118557683379001, "learning_rate": 4.007340529092594e-06, "loss": 0.4613, "num_tokens": 10334043193.0, "step": 2474 }, { "epoch": 4.951665414475332, "grad_norm": 0.07511354422176861, "learning_rate": 4.00668841503934e-06, "loss": 0.4608, "num_tokens": 10338237497.0, "step": 2475 }, { "epoch": 4.95366892061107, "grad_norm": 0.0781405680857428, "learning_rate": 4.006066624536777e-06, "loss": 0.4625, "num_tokens": 10342410183.0, "step": 2476 }, { "epoch": 4.955672426746807, "grad_norm": 0.0782254316653382, "learning_rate": 4.005475158632789e-06, "loss": 0.4694, "num_tokens": 10346573462.0, "step": 2477 }, { "epoch": 4.957675932882545, "grad_norm": 0.08294703113851087, "learning_rate": 4.004914018324154e-06, "loss": 0.471, "num_tokens": 10350767766.0, "step": 2478 }, { "epoch": 4.959679439018282, "grad_norm": 0.08936718410695714, "learning_rate": 4.004383204556544e-06, "loss": 0.4746, "num_tokens": 10354940512.0, "step": 2479 }, { "epoch": 4.96168294515402, "grad_norm": 0.07927805670045948, "learning_rate": 4.003882718224523e-06, "loss": 0.4712, "num_tokens": 10359123058.0, "step": 2480 }, { "epoch": 4.963686451289757, "grad_norm": 0.08186322238148677, "learning_rate": 4.003412560171543e-06, "loss": 0.473, "num_tokens": 10363314325.0, "step": 2481 }, { "epoch": 4.965689957425495, "grad_norm": 0.09924403785722011, "learning_rate": 4.0029727311899504e-06, "loss": 0.4712, "num_tokens": 10367496075.0, "step": 2482 }, { "epoch": 4.967693463561233, "grad_norm": 0.08288274593476702, "learning_rate": 4.00256323202097e-06, "loss": 0.4586, "num_tokens": 10371690379.0, "step": 2483 }, { "epoch": 4.96969696969697, "grad_norm": 0.08236002135532061, "learning_rate": 4.0021840633547195e-06, "loss": 0.4691, "num_tokens": 10375884683.0, "step": 2484 }, { "epoch": 4.971700475832707, "grad_norm": 0.08236422432573733, "learning_rate": 4.001835225830201e-06, "loss": 0.4722, "num_tokens": 10380065572.0, "step": 2485 }, { "epoch": 4.973703981968445, "grad_norm": 0.08463405452809444, "learning_rate": 4.001516720035297e-06, "loss": 0.4723, "num_tokens": 10384235268.0, "step": 2486 }, { "epoch": 4.975707488104183, "grad_norm": 0.0821747545467207, "learning_rate": 4.001228546506775e-06, "loss": 0.4757, "num_tokens": 10388384072.0, "step": 2487 }, { "epoch": 4.97771099423992, "grad_norm": 0.08894914640691608, "learning_rate": 4.0009707057302875e-06, "loss": 0.4676, "num_tokens": 10392570115.0, "step": 2488 }, { "epoch": 4.979714500375658, "grad_norm": 0.08390026321802917, "learning_rate": 4.0007431981403625e-06, "loss": 0.4739, "num_tokens": 10396747578.0, "step": 2489 }, { "epoch": 4.981718006511395, "grad_norm": 0.08560177851885656, "learning_rate": 4.000546024120413e-06, "loss": 0.4844, "num_tokens": 10400941882.0, "step": 2490 }, { "epoch": 4.983721512647133, "grad_norm": 0.07907485753388858, "learning_rate": 4.000379184002727e-06, "loss": 0.4641, "num_tokens": 10405106087.0, "step": 2491 }, { "epoch": 4.98572501878287, "grad_norm": 0.0870754160531539, "learning_rate": 4.000242678068479e-06, "loss": 0.4871, "num_tokens": 10409291398.0, "step": 2492 }, { "epoch": 4.987728524918608, "grad_norm": 0.08133993019801855, "learning_rate": 4.000136506547716e-06, "loss": 0.4763, "num_tokens": 10413485702.0, "step": 2493 }, { "epoch": 4.989732031054345, "grad_norm": 0.08208405226297869, "learning_rate": 4.000060669619365e-06, "loss": 0.4674, "num_tokens": 10417680006.0, "step": 2494 }, { "epoch": 4.991735537190083, "grad_norm": 0.07946149263704042, "learning_rate": 4.000015167411231e-06, "loss": 0.4827, "num_tokens": 10421874310.0, "step": 2495 }, { "epoch": 4.991735537190083, "step": 2495, "total_flos": 3.879283297703449e+20, "train_loss": 0.5371873642375808, "train_runtime": 251254.6794, "train_samples_per_second": 1.271, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 2495, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.879283297703449e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }