{ "best_metric": 0.439, "best_model_checkpoint": "runs/legis-llama3-1-8b-valid-arandu/checkpoint-1120", "epoch": 0.9995600527936648, "eval_steps": 5, "global_step": 1136, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004399472063352398, "grad_norm": 25.937191009521484, "learning_rate": 8.771929824561403e-06, "loss": 1.0992, "step": 5 }, { "epoch": 0.004399472063352398, "eval_loss": 1.1428982019424438, "eval_runtime": 29.8805, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.301, "step": 5 }, { "epoch": 0.008798944126704795, "grad_norm": 32.52676773071289, "learning_rate": 1.7543859649122806e-05, "loss": 1.067, "step": 10 }, { "epoch": 0.008798944126704795, "eval_loss": 1.0669578313827515, "eval_runtime": 28.5282, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 10 }, { "epoch": 0.013198416190057193, "grad_norm": 78.51001739501953, "learning_rate": 2.6315789473684212e-05, "loss": 1.0057, "step": 15 }, { "epoch": 0.013198416190057193, "eval_loss": 1.0462743043899536, "eval_runtime": 28.5697, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 15 }, { "epoch": 0.01759788825340959, "grad_norm": 21.255964279174805, "learning_rate": 3.508771929824561e-05, "loss": 0.9236, "step": 20 }, { "epoch": 0.01759788825340959, "eval_loss": 0.9604344367980957, "eval_runtime": 28.6152, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 20 }, { "epoch": 0.02199736031676199, "grad_norm": 1.3699233531951904, "learning_rate": 4.3859649122807014e-05, "loss": 0.8823, "step": 25 }, { "epoch": 0.02199736031676199, "eval_loss": 0.9002779126167297, "eval_runtime": 28.579, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 25 }, { "epoch": 0.026396832380114386, "grad_norm": 2.50810170173645, "learning_rate": 5.2631578947368424e-05, "loss": 0.8144, "step": 30 }, { "epoch": 0.026396832380114386, "eval_loss": 0.8441588878631592, "eval_runtime": 28.4936, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 30 }, { "epoch": 0.030796304443466784, "grad_norm": 1.6816316843032837, "learning_rate": 6.140350877192983e-05, "loss": 0.7829, "step": 35 }, { "epoch": 0.030796304443466784, "eval_loss": 0.7928382754325867, "eval_runtime": 28.5908, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 35 }, { "epoch": 0.03519577650681918, "grad_norm": 0.5125584006309509, "learning_rate": 7.017543859649122e-05, "loss": 0.7075, "step": 40 }, { "epoch": 0.03519577650681918, "eval_loss": 0.7538504600524902, "eval_runtime": 28.5816, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 40 }, { "epoch": 0.039595248570171576, "grad_norm": 0.36081045866012573, "learning_rate": 7.894736842105263e-05, "loss": 0.6776, "step": 45 }, { "epoch": 0.039595248570171576, "eval_loss": 0.7313268184661865, "eval_runtime": 28.6141, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 45 }, { "epoch": 0.04399472063352398, "grad_norm": 0.32318177819252014, "learning_rate": 8.771929824561403e-05, "loss": 0.6499, "step": 50 }, { "epoch": 0.04399472063352398, "eval_loss": 0.71351158618927, "eval_runtime": 28.5766, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 50 }, { "epoch": 0.04839419269687637, "grad_norm": 0.34377261996269226, "learning_rate": 9.649122807017544e-05, "loss": 0.6487, "step": 55 }, { "epoch": 0.04839419269687637, "eval_loss": 0.7006722092628479, "eval_runtime": 28.6048, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 55 }, { "epoch": 0.05279366476022877, "grad_norm": 0.4360629618167877, "learning_rate": 0.00010526315789473685, "loss": 0.6405, "step": 60 }, { "epoch": 0.05279366476022877, "eval_loss": 0.6905343532562256, "eval_runtime": 28.5257, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 60 }, { "epoch": 0.05719313682358117, "grad_norm": 0.28764936327934265, "learning_rate": 0.00011403508771929824, "loss": 0.6352, "step": 65 }, { "epoch": 0.05719313682358117, "eval_loss": 0.68143630027771, "eval_runtime": 28.6362, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 65 }, { "epoch": 0.06159260888693357, "grad_norm": 0.34088754653930664, "learning_rate": 0.00012280701754385965, "loss": 0.6064, "step": 70 }, { "epoch": 0.06159260888693357, "eval_loss": 0.6742813587188721, "eval_runtime": 28.5667, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 70 }, { "epoch": 0.06599208095028597, "grad_norm": 0.31284183263778687, "learning_rate": 0.00013157894736842108, "loss": 0.5924, "step": 75 }, { "epoch": 0.06599208095028597, "eval_loss": 0.6679767966270447, "eval_runtime": 28.461, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 75 }, { "epoch": 0.07039155301363836, "grad_norm": 0.30470508337020874, "learning_rate": 0.00014035087719298245, "loss": 0.5992, "step": 80 }, { "epoch": 0.07039155301363836, "eval_loss": 0.6631008386611938, "eval_runtime": 28.6891, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 80 }, { "epoch": 0.07479102507699076, "grad_norm": 0.3255262076854706, "learning_rate": 0.00014912280701754387, "loss": 0.5704, "step": 85 }, { "epoch": 0.07479102507699076, "eval_loss": 0.658618688583374, "eval_runtime": 28.6094, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 85 }, { "epoch": 0.07919049714034315, "grad_norm": 0.31922295689582825, "learning_rate": 0.00015789473684210527, "loss": 0.6048, "step": 90 }, { "epoch": 0.07919049714034315, "eval_loss": 0.6537344455718994, "eval_runtime": 28.532, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 90 }, { "epoch": 0.08358996920369556, "grad_norm": 0.45636337995529175, "learning_rate": 0.0001666666666666667, "loss": 0.613, "step": 95 }, { "epoch": 0.08358996920369556, "eval_loss": 0.6501972079277039, "eval_runtime": 28.6568, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 95 }, { "epoch": 0.08798944126704795, "grad_norm": 0.29334941506385803, "learning_rate": 0.00017543859649122806, "loss": 0.5799, "step": 100 }, { "epoch": 0.08798944126704795, "eval_loss": 0.6471393704414368, "eval_runtime": 28.5997, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 100 }, { "epoch": 0.09238891333040035, "grad_norm": 0.31318825483322144, "learning_rate": 0.00018421052631578948, "loss": 0.5887, "step": 105 }, { "epoch": 0.09238891333040035, "eval_loss": 0.6440868377685547, "eval_runtime": 28.6275, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 105 }, { "epoch": 0.09678838539375274, "grad_norm": 0.27908894419670105, "learning_rate": 0.00019298245614035088, "loss": 0.5905, "step": 110 }, { "epoch": 0.09678838539375274, "eval_loss": 0.6423875689506531, "eval_runtime": 28.5491, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 110 }, { "epoch": 0.10118785745710515, "grad_norm": 0.2715133726596832, "learning_rate": 0.00019999952753720356, "loss": 0.5902, "step": 115 }, { "epoch": 0.10118785745710515, "eval_loss": 0.6415910720825195, "eval_runtime": 28.5086, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 115 }, { "epoch": 0.10558732952045755, "grad_norm": 0.3028790056705475, "learning_rate": 0.000199982991808088, "loss": 0.5773, "step": 120 }, { "epoch": 0.10558732952045755, "eval_loss": 0.6377425789833069, "eval_runtime": 28.6438, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 120 }, { "epoch": 0.10998680158380994, "grad_norm": 0.3071883022785187, "learning_rate": 0.00019994283740338306, "loss": 0.5598, "step": 125 }, { "epoch": 0.10998680158380994, "eval_loss": 0.6367806196212769, "eval_runtime": 28.4852, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 125 }, { "epoch": 0.11438627364716233, "grad_norm": 0.34842655062675476, "learning_rate": 0.00019987907380864062, "loss": 0.596, "step": 130 }, { "epoch": 0.11438627364716233, "eval_loss": 0.6347749829292297, "eval_runtime": 28.5908, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 130 }, { "epoch": 0.11878574571051474, "grad_norm": 0.2854275107383728, "learning_rate": 0.00019979171608653924, "loss": 0.5733, "step": 135 }, { "epoch": 0.11878574571051474, "eval_loss": 0.6301032900810242, "eval_runtime": 28.5482, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 135 }, { "epoch": 0.12318521777386714, "grad_norm": 0.27615901827812195, "learning_rate": 0.00019968078487332566, "loss": 0.5875, "step": 140 }, { "epoch": 0.12318521777386714, "eval_loss": 0.6269793510437012, "eval_runtime": 28.4974, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 140 }, { "epoch": 0.12758468983721954, "grad_norm": 0.2709368169307709, "learning_rate": 0.00019954630637394029, "loss": 0.5711, "step": 145 }, { "epoch": 0.12758468983721954, "eval_loss": 0.6240233182907104, "eval_runtime": 28.5264, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 145 }, { "epoch": 0.13198416190057194, "grad_norm": 0.2877412736415863, "learning_rate": 0.00019938831235582672, "loss": 0.5885, "step": 150 }, { "epoch": 0.13198416190057194, "eval_loss": 0.6206945776939392, "eval_runtime": 28.5668, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 150 }, { "epoch": 0.13638363396392433, "grad_norm": 0.2922605574131012, "learning_rate": 0.00019920684014142738, "loss": 0.5485, "step": 155 }, { "epoch": 0.13638363396392433, "eval_loss": 0.6200662851333618, "eval_runtime": 28.5452, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 155 }, { "epoch": 0.14078310602727673, "grad_norm": 0.28340834379196167, "learning_rate": 0.00019900193259936704, "loss": 0.5754, "step": 160 }, { "epoch": 0.14078310602727673, "eval_loss": 0.6187402606010437, "eval_runtime": 28.5939, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 160 }, { "epoch": 0.14518257809062912, "grad_norm": 0.2796618938446045, "learning_rate": 0.0001987736381343261, "loss": 0.5535, "step": 165 }, { "epoch": 0.14518257809062912, "eval_loss": 0.6156266331672668, "eval_runtime": 28.5378, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 165 }, { "epoch": 0.14958205015398152, "grad_norm": 0.25343528389930725, "learning_rate": 0.00019852201067560606, "loss": 0.5697, "step": 170 }, { "epoch": 0.14958205015398152, "eval_loss": 0.6125033497810364, "eval_runtime": 28.5565, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 170 }, { "epoch": 0.1539815222173339, "grad_norm": 0.23438464105129242, "learning_rate": 0.00019824710966438996, "loss": 0.5335, "step": 175 }, { "epoch": 0.1539815222173339, "eval_loss": 0.6096713542938232, "eval_runtime": 28.6017, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 175 }, { "epoch": 0.1583809942806863, "grad_norm": 0.24729043245315552, "learning_rate": 0.00019794900003970077, "loss": 0.5702, "step": 180 }, { "epoch": 0.1583809942806863, "eval_loss": 0.6071114540100098, "eval_runtime": 28.5677, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 180 }, { "epoch": 0.16278046634403873, "grad_norm": 0.257964551448822, "learning_rate": 0.00019762775222306107, "loss": 0.5494, "step": 185 }, { "epoch": 0.16278046634403873, "eval_loss": 0.6062531471252441, "eval_runtime": 28.5933, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 185 }, { "epoch": 0.16717993840739112, "grad_norm": 0.2648680806159973, "learning_rate": 0.0001972834421018576, "loss": 0.5379, "step": 190 }, { "epoch": 0.16717993840739112, "eval_loss": 0.6054437756538391, "eval_runtime": 28.5575, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 190 }, { "epoch": 0.17157941047074352, "grad_norm": 0.2540712356567383, "learning_rate": 0.00019691615101141455, "loss": 0.5415, "step": 195 }, { "epoch": 0.17157941047074352, "eval_loss": 0.6023730039596558, "eval_runtime": 28.5419, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 195 }, { "epoch": 0.1759788825340959, "grad_norm": 0.2424851357936859, "learning_rate": 0.00019652596571578004, "loss": 0.5504, "step": 200 }, { "epoch": 0.1759788825340959, "eval_loss": 0.5997632145881653, "eval_runtime": 28.6422, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.1803783545974483, "grad_norm": 0.2573873698711395, "learning_rate": 0.0001961129783872301, "loss": 0.5418, "step": 205 }, { "epoch": 0.1803783545974483, "eval_loss": 0.5976300239562988, "eval_runtime": 28.5752, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 205 }, { "epoch": 0.1847778266608007, "grad_norm": 0.22338183224201202, "learning_rate": 0.00019567728658449504, "loss": 0.54, "step": 210 }, { "epoch": 0.1847778266608007, "eval_loss": 0.5960862040519714, "eval_runtime": 28.4685, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 210 }, { "epoch": 0.1891772987241531, "grad_norm": 0.2706097960472107, "learning_rate": 0.00019521899322971352, "loss": 0.5522, "step": 215 }, { "epoch": 0.1891772987241531, "eval_loss": 0.5958646535873413, "eval_runtime": 28.5678, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 215 }, { "epoch": 0.1935767707875055, "grad_norm": 0.23476411402225494, "learning_rate": 0.00019473820658411957, "loss": 0.5262, "step": 220 }, { "epoch": 0.1935767707875055, "eval_loss": 0.5945417284965515, "eval_runtime": 28.5611, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 220 }, { "epoch": 0.1979762428508579, "grad_norm": 0.23705659806728363, "learning_rate": 0.00019423504022246825, "loss": 0.5439, "step": 225 }, { "epoch": 0.1979762428508579, "eval_loss": 0.5934200286865234, "eval_runtime": 28.5955, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 225 }, { "epoch": 0.2023757149142103, "grad_norm": 0.22662319242954254, "learning_rate": 0.00019370961300620637, "loss": 0.5262, "step": 230 }, { "epoch": 0.2023757149142103, "eval_loss": 0.5928044319152832, "eval_runtime": 28.514, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 230 }, { "epoch": 0.2067751869775627, "grad_norm": 0.24046145379543304, "learning_rate": 0.00019316204905539425, "loss": 0.5462, "step": 235 }, { "epoch": 0.2067751869775627, "eval_loss": 0.5904839038848877, "eval_runtime": 28.5557, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 235 }, { "epoch": 0.2111746590409151, "grad_norm": 0.23923470079898834, "learning_rate": 0.000192592477719385, "loss": 0.5345, "step": 240 }, { "epoch": 0.2111746590409151, "eval_loss": 0.590508759021759, "eval_runtime": 28.5204, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 240 }, { "epoch": 0.21557413110426749, "grad_norm": 0.24345721304416656, "learning_rate": 0.00019200103354626892, "loss": 0.5478, "step": 245 }, { "epoch": 0.21557413110426749, "eval_loss": 0.5882726907730103, "eval_runtime": 28.5722, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 245 }, { "epoch": 0.21997360316761988, "grad_norm": 0.27501732110977173, "learning_rate": 0.00019138785625108957, "loss": 0.5607, "step": 250 }, { "epoch": 0.21997360316761988, "eval_loss": 0.5860432982444763, "eval_runtime": 28.503, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 250 }, { "epoch": 0.22437307523097227, "grad_norm": 0.3151032328605652, "learning_rate": 0.0001907530906828393, "loss": 0.5479, "step": 255 }, { "epoch": 0.22437307523097227, "eval_loss": 0.5846895575523376, "eval_runtime": 28.6081, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 255 }, { "epoch": 0.22877254729432467, "grad_norm": 0.2758755385875702, "learning_rate": 0.0001900968867902419, "loss": 0.5767, "step": 260 }, { "epoch": 0.22877254729432467, "eval_loss": 0.5815722942352295, "eval_runtime": 28.5574, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 260 }, { "epoch": 0.2331720193576771, "grad_norm": 0.25241315364837646, "learning_rate": 0.000189419399586331, "loss": 0.5568, "step": 265 }, { "epoch": 0.2331720193576771, "eval_loss": 0.5822274684906006, "eval_runtime": 28.573, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 265 }, { "epoch": 0.23757149142102948, "grad_norm": 0.316436767578125, "learning_rate": 0.00018872078911183146, "loss": 0.5385, "step": 270 }, { "epoch": 0.23757149142102948, "eval_loss": 0.5809066891670227, "eval_runtime": 28.5598, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 270 }, { "epoch": 0.24197096348438188, "grad_norm": 0.27813801169395447, "learning_rate": 0.00018800122039735358, "loss": 0.5348, "step": 275 }, { "epoch": 0.24197096348438188, "eval_loss": 0.5786107778549194, "eval_runtime": 28.546, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 275 }, { "epoch": 0.24637043554773427, "grad_norm": 0.2552705407142639, "learning_rate": 0.00018726086342440846, "loss": 0.5207, "step": 280 }, { "epoch": 0.24637043554773427, "eval_loss": 0.5768923759460449, "eval_runtime": 28.5995, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 280 }, { "epoch": 0.2507699076110867, "grad_norm": 0.21993091702461243, "learning_rate": 0.00018649989308525372, "loss": 0.5292, "step": 285 }, { "epoch": 0.2507699076110867, "eval_loss": 0.5762263536453247, "eval_runtime": 28.4816, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 285 }, { "epoch": 0.2551693796744391, "grad_norm": 0.27086153626441956, "learning_rate": 0.0001857184891415794, "loss": 0.5312, "step": 290 }, { "epoch": 0.2551693796744391, "eval_loss": 0.5758266448974609, "eval_runtime": 28.5295, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 290 }, { "epoch": 0.2595688517377915, "grad_norm": 0.21816319227218628, "learning_rate": 0.0001849168361820431, "loss": 0.5223, "step": 295 }, { "epoch": 0.2595688517377915, "eval_loss": 0.574447751045227, "eval_runtime": 28.5859, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 295 }, { "epoch": 0.2639683238011439, "grad_norm": 0.24796700477600098, "learning_rate": 0.00018409512357866548, "loss": 0.5485, "step": 300 }, { "epoch": 0.2639683238011439, "eval_loss": 0.573371410369873, "eval_runtime": 28.6178, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 300 }, { "epoch": 0.2683677958644963, "grad_norm": 0.2425287663936615, "learning_rate": 0.00018325354544209535, "loss": 0.5217, "step": 305 }, { "epoch": 0.2683677958644963, "eval_loss": 0.5723298788070679, "eval_runtime": 28.5916, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 305 }, { "epoch": 0.27276726792784867, "grad_norm": 0.21630050241947174, "learning_rate": 0.00018239230057575542, "loss": 0.5074, "step": 310 }, { "epoch": 0.27276726792784867, "eval_loss": 0.5725327134132385, "eval_runtime": 28.536, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 310 }, { "epoch": 0.27716673999120106, "grad_norm": 0.21529468894004822, "learning_rate": 0.0001815115924288798, "loss": 0.5487, "step": 315 }, { "epoch": 0.27716673999120106, "eval_loss": 0.5721793174743652, "eval_runtime": 28.6852, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 315 }, { "epoch": 0.28156621205455346, "grad_norm": 0.21623414754867554, "learning_rate": 0.00018061162904845358, "loss": 0.5106, "step": 320 }, { "epoch": 0.28156621205455346, "eval_loss": 0.5709577202796936, "eval_runtime": 28.4592, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 320 }, { "epoch": 0.28596568411790585, "grad_norm": 0.2219308316707611, "learning_rate": 0.0001796926230300667, "loss": 0.5218, "step": 325 }, { "epoch": 0.28596568411790585, "eval_loss": 0.5698617100715637, "eval_runtime": 28.5588, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 325 }, { "epoch": 0.29036515618125824, "grad_norm": 0.2264701873064041, "learning_rate": 0.00017875479146769305, "loss": 0.5162, "step": 330 }, { "epoch": 0.29036515618125824, "eval_loss": 0.5689781308174133, "eval_runtime": 28.6221, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 330 }, { "epoch": 0.29476462824461064, "grad_norm": 0.24004362523555756, "learning_rate": 0.000177798355902407, "loss": 0.539, "step": 335 }, { "epoch": 0.29476462824461064, "eval_loss": 0.5678241848945618, "eval_runtime": 28.5677, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 335 }, { "epoch": 0.29916410030796303, "grad_norm": 0.22996000945568085, "learning_rate": 0.00017682354227004963, "loss": 0.5002, "step": 340 }, { "epoch": 0.29916410030796303, "eval_loss": 0.5670127272605896, "eval_runtime": 28.6425, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 340 }, { "epoch": 0.3035635723713154, "grad_norm": 0.23163671791553497, "learning_rate": 0.00017583058084785625, "loss": 0.5175, "step": 345 }, { "epoch": 0.3035635723713154, "eval_loss": 0.5650352239608765, "eval_runtime": 28.5994, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 345 }, { "epoch": 0.3079630444346678, "grad_norm": 0.20120489597320557, "learning_rate": 0.00017481970620005912, "loss": 0.5269, "step": 350 }, { "epoch": 0.3079630444346678, "eval_loss": 0.5640237927436829, "eval_runtime": 28.5009, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 350 }, { "epoch": 0.3123625164980202, "grad_norm": 0.22231583297252655, "learning_rate": 0.00017379115712247675, "loss": 0.5444, "step": 355 }, { "epoch": 0.3123625164980202, "eval_loss": 0.5634257197380066, "eval_runtime": 28.5722, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 355 }, { "epoch": 0.3167619885613726, "grad_norm": 0.216331347823143, "learning_rate": 0.00017274517658610398, "loss": 0.5074, "step": 360 }, { "epoch": 0.3167619885613726, "eval_loss": 0.5618783831596375, "eval_runtime": 28.6759, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 360 }, { "epoch": 0.32116146062472506, "grad_norm": 0.21976010501384735, "learning_rate": 0.0001716820116797158, "loss": 0.5259, "step": 365 }, { "epoch": 0.32116146062472506, "eval_loss": 0.5602042078971863, "eval_runtime": 28.6019, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 365 }, { "epoch": 0.32556093268807745, "grad_norm": 0.22740119695663452, "learning_rate": 0.0001706019135514982, "loss": 0.5158, "step": 370 }, { "epoch": 0.32556093268807745, "eval_loss": 0.5599080920219421, "eval_runtime": 28.5177, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 370 }, { "epoch": 0.32996040475142985, "grad_norm": 0.21888501942157745, "learning_rate": 0.0001695051373497202, "loss": 0.527, "step": 375 }, { "epoch": 0.32996040475142985, "eval_loss": 0.558814525604248, "eval_runtime": 28.661, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 375 }, { "epoch": 0.33435987681478224, "grad_norm": 0.20402850210666656, "learning_rate": 0.00016839194216246108, "loss": 0.5027, "step": 380 }, { "epoch": 0.33435987681478224, "eval_loss": 0.5578404664993286, "eval_runtime": 28.5421, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 380 }, { "epoch": 0.33875934887813464, "grad_norm": 0.20368748903274536, "learning_rate": 0.00016726259095640664, "loss": 0.505, "step": 385 }, { "epoch": 0.33875934887813464, "eval_loss": 0.5567160844802856, "eval_runtime": 28.6126, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 385 }, { "epoch": 0.34315882094148703, "grad_norm": 0.2069130390882492, "learning_rate": 0.0001661173505147295, "loss": 0.5086, "step": 390 }, { "epoch": 0.34315882094148703, "eval_loss": 0.55617755651474, "eval_runtime": 28.4879, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 390 }, { "epoch": 0.3475582930048394, "grad_norm": 0.23644201457500458, "learning_rate": 0.00016495649137406772, "loss": 0.5412, "step": 395 }, { "epoch": 0.3475582930048394, "eval_loss": 0.5556927919387817, "eval_runtime": 28.6713, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 395 }, { "epoch": 0.3519577650681918, "grad_norm": 0.21997737884521484, "learning_rate": 0.00016378028776061667, "loss": 0.4908, "step": 400 }, { "epoch": 0.3519577650681918, "eval_loss": 0.5555915832519531, "eval_runtime": 28.596, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 400 }, { "epoch": 0.3563572371315442, "grad_norm": 0.22075805068016052, "learning_rate": 0.00016258901752534948, "loss": 0.5155, "step": 405 }, { "epoch": 0.3563572371315442, "eval_loss": 0.5552019476890564, "eval_runtime": 28.595, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 405 }, { "epoch": 0.3607567091948966, "grad_norm": 0.5917304158210754, "learning_rate": 0.00016138296207838127, "loss": 0.4991, "step": 410 }, { "epoch": 0.3607567091948966, "eval_loss": 0.5550567507743835, "eval_runtime": 28.6222, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 410 }, { "epoch": 0.365156181258249, "grad_norm": 0.21421152353286743, "learning_rate": 0.00016016240632249224, "loss": 0.4769, "step": 415 }, { "epoch": 0.365156181258249, "eval_loss": 0.5548796653747559, "eval_runtime": 28.5933, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 415 }, { "epoch": 0.3695556533216014, "grad_norm": 0.201774463057518, "learning_rate": 0.0001589276385858262, "loss": 0.4914, "step": 420 }, { "epoch": 0.3695556533216014, "eval_loss": 0.5546624064445496, "eval_runtime": 28.5213, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 420 }, { "epoch": 0.3739551253849538, "grad_norm": 0.22172759473323822, "learning_rate": 0.0001576789505537795, "loss": 0.4726, "step": 425 }, { "epoch": 0.3739551253849538, "eval_loss": 0.5535080432891846, "eval_runtime": 28.6645, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 425 }, { "epoch": 0.3783545974483062, "grad_norm": 0.23269815742969513, "learning_rate": 0.00015641663720009733, "loss": 0.5076, "step": 430 }, { "epoch": 0.3783545974483062, "eval_loss": 0.5522862076759338, "eval_runtime": 28.5697, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 430 }, { "epoch": 0.3827540695116586, "grad_norm": 0.23303498327732086, "learning_rate": 0.00015514099671719268, "loss": 0.5064, "step": 435 }, { "epoch": 0.3827540695116586, "eval_loss": 0.5502522587776184, "eval_runtime": 28.5369, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 435 }, { "epoch": 0.387153541575011, "grad_norm": 0.24087387323379517, "learning_rate": 0.00015385233044570555, "loss": 0.5361, "step": 440 }, { "epoch": 0.387153541575011, "eval_loss": 0.5471201539039612, "eval_runtime": 28.5791, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 440 }, { "epoch": 0.3915530136383634, "grad_norm": 0.20800553262233734, "learning_rate": 0.00015255094280331797, "loss": 0.5169, "step": 445 }, { "epoch": 0.3915530136383634, "eval_loss": 0.5466722846031189, "eval_runtime": 28.6339, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 445 }, { "epoch": 0.3959524857017158, "grad_norm": 0.37092360854148865, "learning_rate": 0.0001512371412128424, "loss": 0.5362, "step": 450 }, { "epoch": 0.3959524857017158, "eval_loss": 0.5455148220062256, "eval_runtime": 28.637, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 450 }, { "epoch": 0.4003519577650682, "grad_norm": 0.20706337690353394, "learning_rate": 0.00014991123602960018, "loss": 0.4994, "step": 455 }, { "epoch": 0.4003519577650682, "eval_loss": 0.5440109968185425, "eval_runtime": 28.5672, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 455 }, { "epoch": 0.4047514298284206, "grad_norm": 0.2135256677865982, "learning_rate": 0.00014857354046810732, "loss": 0.5005, "step": 460 }, { "epoch": 0.4047514298284206, "eval_loss": 0.5431147813796997, "eval_runtime": 28.4835, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 460 }, { "epoch": 0.409150901891773, "grad_norm": 0.5737074613571167, "learning_rate": 0.00014722437052808472, "loss": 0.5208, "step": 465 }, { "epoch": 0.409150901891773, "eval_loss": 0.541969358921051, "eval_runtime": 28.6004, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 465 }, { "epoch": 0.4135503739551254, "grad_norm": 0.24099959433078766, "learning_rate": 0.00014586404491981052, "loss": 0.5074, "step": 470 }, { "epoch": 0.4135503739551254, "eval_loss": 0.5449388027191162, "eval_runtime": 28.658, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 470 }, { "epoch": 0.4179498460184778, "grad_norm": 0.2046642154455185, "learning_rate": 0.0001444928849888321, "loss": 0.5052, "step": 475 }, { "epoch": 0.4179498460184778, "eval_loss": 0.5407991409301758, "eval_runtime": 28.5688, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 475 }, { "epoch": 0.4223493180818302, "grad_norm": 0.2824171185493469, "learning_rate": 0.00014311121464005583, "loss": 0.5179, "step": 480 }, { "epoch": 0.4223493180818302, "eval_loss": 0.54000324010849, "eval_runtime": 28.7144, "eval_samples_per_second": 0.592, "eval_steps_per_second": 0.313, "step": 480 }, { "epoch": 0.4267487901451826, "grad_norm": 0.2045980840921402, "learning_rate": 0.00014171936026123168, "loss": 0.4634, "step": 485 }, { "epoch": 0.4267487901451826, "eval_loss": 0.5398800373077393, "eval_runtime": 28.5209, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 485 }, { "epoch": 0.43114826220853497, "grad_norm": 0.2092169225215912, "learning_rate": 0.00014031765064585197, "loss": 0.4802, "step": 490 }, { "epoch": 0.43114826220853497, "eval_loss": 0.5395181179046631, "eval_runtime": 28.5086, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 490 }, { "epoch": 0.43554773427188737, "grad_norm": 0.20700140297412872, "learning_rate": 0.00013890641691548114, "loss": 0.4962, "step": 495 }, { "epoch": 0.43554773427188737, "eval_loss": 0.5390854477882385, "eval_runtime": 28.5682, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 495 }, { "epoch": 0.43994720633523976, "grad_norm": 0.19903522729873657, "learning_rate": 0.00013748599244153633, "loss": 0.4841, "step": 500 }, { "epoch": 0.43994720633523976, "eval_loss": 0.5381758213043213, "eval_runtime": 29.4274, "eval_samples_per_second": 0.578, "eval_steps_per_second": 0.306, "step": 500 }, { "epoch": 0.44434667839859215, "grad_norm": 0.4766729474067688, "learning_rate": 0.00013605671276653567, "loss": 0.5252, "step": 505 }, { "epoch": 0.44434667839859215, "eval_loss": 0.5368968844413757, "eval_runtime": 28.6474, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 505 }, { "epoch": 0.44874615046194455, "grad_norm": 0.21688155829906464, "learning_rate": 0.00013461891552483444, "loss": 0.515, "step": 510 }, { "epoch": 0.44874615046194455, "eval_loss": 0.5366407036781311, "eval_runtime": 28.5352, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 510 }, { "epoch": 0.45314562252529694, "grad_norm": 0.20375116169452667, "learning_rate": 0.00013317294036286644, "loss": 0.4887, "step": 515 }, { "epoch": 0.45314562252529694, "eval_loss": 0.5360764861106873, "eval_runtime": 28.6533, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 515 }, { "epoch": 0.45754509458864934, "grad_norm": 0.1958196461200714, "learning_rate": 0.00013171912885891063, "loss": 0.4868, "step": 520 }, { "epoch": 0.45754509458864934, "eval_loss": 0.5356424450874329, "eval_runtime": 28.5027, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 520 }, { "epoch": 0.4619445666520018, "grad_norm": 0.22040507197380066, "learning_rate": 0.00013025782444240087, "loss": 0.5086, "step": 525 }, { "epoch": 0.4619445666520018, "eval_loss": 0.5351347327232361, "eval_runtime": 28.6428, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 525 }, { "epoch": 0.4663440387153542, "grad_norm": 0.19495758414268494, "learning_rate": 0.00012878937231279892, "loss": 0.5113, "step": 530 }, { "epoch": 0.4663440387153542, "eval_loss": 0.5347647070884705, "eval_runtime": 28.6252, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 530 }, { "epoch": 0.4707435107787066, "grad_norm": 0.21149738132953644, "learning_rate": 0.0001273141193580488, "loss": 0.483, "step": 535 }, { "epoch": 0.4707435107787066, "eval_loss": 0.5339221954345703, "eval_runtime": 28.6055, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 535 }, { "epoch": 0.47514298284205897, "grad_norm": 0.20391018688678741, "learning_rate": 0.0001258324140726326, "loss": 0.4728, "step": 540 }, { "epoch": 0.47514298284205897, "eval_loss": 0.5337977409362793, "eval_runtime": 28.5842, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 540 }, { "epoch": 0.47954245490541136, "grad_norm": 0.20913545787334442, "learning_rate": 0.00012434460647524676, "loss": 0.5016, "step": 545 }, { "epoch": 0.47954245490541136, "eval_loss": 0.532899022102356, "eval_runtime": 28.4759, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 545 }, { "epoch": 0.48394192696876376, "grad_norm": 0.19410260021686554, "learning_rate": 0.00012285104802611812, "loss": 0.5103, "step": 550 }, { "epoch": 0.48394192696876376, "eval_loss": 0.5321294665336609, "eval_runtime": 28.5662, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 550 }, { "epoch": 0.48834139903211615, "grad_norm": 0.2097245752811432, "learning_rate": 0.00012135209154397962, "loss": 0.4954, "step": 555 }, { "epoch": 0.48834139903211615, "eval_loss": 0.532034695148468, "eval_runtime": 28.652, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 555 }, { "epoch": 0.49274087109546855, "grad_norm": 0.21518121659755707, "learning_rate": 0.00011984809112272495, "loss": 0.4999, "step": 560 }, { "epoch": 0.49274087109546855, "eval_loss": 0.5313233733177185, "eval_runtime": 28.5662, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 560 }, { "epoch": 0.49714034315882094, "grad_norm": 0.19571034610271454, "learning_rate": 0.00011833940204776209, "loss": 0.4931, "step": 565 }, { "epoch": 0.49714034315882094, "eval_loss": 0.5311394333839417, "eval_runtime": 28.5352, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 565 }, { "epoch": 0.5015398152221734, "grad_norm": 0.20554794371128082, "learning_rate": 0.00011682638071208533, "loss": 0.4833, "step": 570 }, { "epoch": 0.5015398152221734, "eval_loss": 0.5300410389900208, "eval_runtime": 28.5679, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 570 }, { "epoch": 0.5059392872855257, "grad_norm": 0.20373423397541046, "learning_rate": 0.00011530938453208559, "loss": 0.5057, "step": 575 }, { "epoch": 0.5059392872855257, "eval_loss": 0.5300309658050537, "eval_runtime": 28.5821, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 575 }, { "epoch": 0.5103387593488782, "grad_norm": 0.1982477903366089, "learning_rate": 0.00011378877186311912, "loss": 0.4754, "step": 580 }, { "epoch": 0.5103387593488782, "eval_loss": 0.5292160511016846, "eval_runtime": 28.5256, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 580 }, { "epoch": 0.5147382314122305, "grad_norm": 0.20576219260692596, "learning_rate": 0.00011226490191485421, "loss": 0.4991, "step": 585 }, { "epoch": 0.5147382314122305, "eval_loss": 0.5280917882919312, "eval_runtime": 28.6835, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 585 }, { "epoch": 0.519137703475583, "grad_norm": 0.2154638022184372, "learning_rate": 0.00011073813466641632, "loss": 0.4811, "step": 590 }, { "epoch": 0.519137703475583, "eval_loss": 0.5274674296379089, "eval_runtime": 28.4766, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 590 }, { "epoch": 0.5235371755389353, "grad_norm": 0.2037007063627243, "learning_rate": 0.00010920883078135117, "loss": 0.4717, "step": 595 }, { "epoch": 0.5235371755389353, "eval_loss": 0.5270927548408508, "eval_runtime": 28.5377, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 595 }, { "epoch": 0.5279366476022878, "grad_norm": 0.21386198699474335, "learning_rate": 0.00010767735152242649, "loss": 0.4776, "step": 600 }, { "epoch": 0.5279366476022878, "eval_loss": 0.526791512966156, "eval_runtime": 28.596, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 600 }, { "epoch": 0.5323361196656401, "grad_norm": 0.1984720528125763, "learning_rate": 0.0001061440586662917, "loss": 0.4708, "step": 605 }, { "epoch": 0.5323361196656401, "eval_loss": 0.5266034007072449, "eval_runtime": 28.6491, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 605 }, { "epoch": 0.5367355917289925, "grad_norm": 0.19453096389770508, "learning_rate": 0.000104609314418017, "loss": 0.4659, "step": 610 }, { "epoch": 0.5367355917289925, "eval_loss": 0.5267328023910522, "eval_runtime": 28.6358, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 610 }, { "epoch": 0.5411350637923449, "grad_norm": 0.2048104703426361, "learning_rate": 0.00010307348132553025, "loss": 0.5138, "step": 615 }, { "epoch": 0.5411350637923449, "eval_loss": 0.5270944833755493, "eval_runtime": 28.5902, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 615 }, { "epoch": 0.5455345358556973, "grad_norm": 0.1899915337562561, "learning_rate": 0.00010153692219397387, "loss": 0.4797, "step": 620 }, { "epoch": 0.5455345358556973, "eval_loss": 0.5260502099990845, "eval_runtime": 28.5533, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 620 }, { "epoch": 0.5499340079190497, "grad_norm": 0.18520919978618622, "learning_rate": 0.0001, "loss": 0.5068, "step": 625 }, { "epoch": 0.5499340079190497, "eval_loss": 0.5251287817955017, "eval_runtime": 28.4846, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 625 }, { "epoch": 0.5543334799824021, "grad_norm": 0.21325986087322235, "learning_rate": 9.84630778060262e-05, "loss": 0.4799, "step": 630 }, { "epoch": 0.5543334799824021, "eval_loss": 0.524385929107666, "eval_runtime": 28.5917, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 630 }, { "epoch": 0.5587329520457545, "grad_norm": 0.20572926104068756, "learning_rate": 9.692651867446973e-05, "loss": 0.49, "step": 635 }, { "epoch": 0.5587329520457545, "eval_loss": 0.523975133895874, "eval_runtime": 28.6052, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 635 }, { "epoch": 0.5631324241091069, "grad_norm": 0.20347937941551208, "learning_rate": 9.539068558198304e-05, "loss": 0.4702, "step": 640 }, { "epoch": 0.5631324241091069, "eval_loss": 0.5229539275169373, "eval_runtime": 28.6223, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 640 }, { "epoch": 0.5675318961724594, "grad_norm": 0.21256154775619507, "learning_rate": 9.38559413337083e-05, "loss": 0.4736, "step": 645 }, { "epoch": 0.5675318961724594, "eval_loss": 0.5221072435379028, "eval_runtime": 28.6189, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 645 }, { "epoch": 0.5719313682358117, "grad_norm": 0.2260565459728241, "learning_rate": 9.232264847757357e-05, "loss": 0.5065, "step": 650 }, { "epoch": 0.5719313682358117, "eval_loss": 0.5213314890861511, "eval_runtime": 28.6771, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 650 }, { "epoch": 0.5763308402991641, "grad_norm": 0.21002529561519623, "learning_rate": 9.079116921864884e-05, "loss": 0.4796, "step": 655 }, { "epoch": 0.5763308402991641, "eval_loss": 0.5214037299156189, "eval_runtime": 28.6202, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 655 }, { "epoch": 0.5807303123625165, "grad_norm": 0.19340470433235168, "learning_rate": 8.92618653335837e-05, "loss": 0.4788, "step": 660 }, { "epoch": 0.5807303123625165, "eval_loss": 0.5211138725280762, "eval_runtime": 28.6313, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 660 }, { "epoch": 0.5851297844258689, "grad_norm": 0.19035720825195312, "learning_rate": 8.773509808514581e-05, "loss": 0.468, "step": 665 }, { "epoch": 0.5851297844258689, "eval_loss": 0.5191999077796936, "eval_runtime": 28.0607, "eval_samples_per_second": 0.606, "eval_steps_per_second": 0.321, "step": 665 }, { "epoch": 0.5895292564892213, "grad_norm": 0.19168096780776978, "learning_rate": 8.62112281368809e-05, "loss": 0.5066, "step": 670 }, { "epoch": 0.5895292564892213, "eval_loss": 0.5176913142204285, "eval_runtime": 28.5375, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 670 }, { "epoch": 0.5939287285525737, "grad_norm": 0.19758321344852448, "learning_rate": 8.469061546791442e-05, "loss": 0.51, "step": 675 }, { "epoch": 0.5939287285525737, "eval_loss": 0.517296314239502, "eval_runtime": 28.5712, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 675 }, { "epoch": 0.5983282006159261, "grad_norm": 0.19562241435050964, "learning_rate": 8.317361928791469e-05, "loss": 0.4932, "step": 680 }, { "epoch": 0.5983282006159261, "eval_loss": 0.5170657634735107, "eval_runtime": 28.4877, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 680 }, { "epoch": 0.6027276726792785, "grad_norm": 0.18590031564235687, "learning_rate": 8.166059795223794e-05, "loss": 0.5055, "step": 685 }, { "epoch": 0.6027276726792785, "eval_loss": 0.5166193842887878, "eval_runtime": 28.625, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 685 }, { "epoch": 0.6071271447426309, "grad_norm": 0.2049984484910965, "learning_rate": 8.015190887727509e-05, "loss": 0.4846, "step": 690 }, { "epoch": 0.6071271447426309, "eval_loss": 0.5160765647888184, "eval_runtime": 28.5582, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 690 }, { "epoch": 0.6115266168059833, "grad_norm": 0.19373777508735657, "learning_rate": 7.864790845602039e-05, "loss": 0.4862, "step": 695 }, { "epoch": 0.6115266168059833, "eval_loss": 0.5157306790351868, "eval_runtime": 28.6078, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 695 }, { "epoch": 0.6159260888693356, "grad_norm": 0.20326727628707886, "learning_rate": 7.714895197388189e-05, "loss": 0.5064, "step": 700 }, { "epoch": 0.6159260888693356, "eval_loss": 0.5153770446777344, "eval_runtime": 28.6597, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 700 }, { "epoch": 0.6203255609326881, "grad_norm": 0.19425565004348755, "learning_rate": 7.565539352475326e-05, "loss": 0.5018, "step": 705 }, { "epoch": 0.6203255609326881, "eval_loss": 0.5147074460983276, "eval_runtime": 28.5261, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 705 }, { "epoch": 0.6247250329960404, "grad_norm": 0.19491039216518402, "learning_rate": 7.416758592736744e-05, "loss": 0.482, "step": 710 }, { "epoch": 0.6247250329960404, "eval_loss": 0.5144516229629517, "eval_runtime": 28.533, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 710 }, { "epoch": 0.6291245050593929, "grad_norm": 0.1957363337278366, "learning_rate": 7.268588064195122e-05, "loss": 0.4883, "step": 715 }, { "epoch": 0.6291245050593929, "eval_loss": 0.5139791965484619, "eval_runtime": 28.5313, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 715 }, { "epoch": 0.6335239771227452, "grad_norm": 0.21253836154937744, "learning_rate": 7.12106276872011e-05, "loss": 0.4768, "step": 720 }, { "epoch": 0.6335239771227452, "eval_loss": 0.5137556195259094, "eval_runtime": 28.6307, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 720 }, { "epoch": 0.6379234491860977, "grad_norm": 0.1721029132604599, "learning_rate": 6.974217555759915e-05, "loss": 0.4816, "step": 725 }, { "epoch": 0.6379234491860977, "eval_loss": 0.5133811831474304, "eval_runtime": 28.5925, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 725 }, { "epoch": 0.6423229212494501, "grad_norm": 0.19211679697036743, "learning_rate": 6.82808711410894e-05, "loss": 0.5035, "step": 730 }, { "epoch": 0.6423229212494501, "eval_loss": 0.5132091641426086, "eval_runtime": 28.5078, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 730 }, { "epoch": 0.6467223933128025, "grad_norm": 0.19252945482730865, "learning_rate": 6.682705963713356e-05, "loss": 0.4822, "step": 735 }, { "epoch": 0.6467223933128025, "eval_loss": 0.5131357908248901, "eval_runtime": 28.6326, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 735 }, { "epoch": 0.6511218653761549, "grad_norm": 0.1986207813024521, "learning_rate": 6.538108447516558e-05, "loss": 0.4612, "step": 740 }, { "epoch": 0.6511218653761549, "eval_loss": 0.5128303170204163, "eval_runtime": 28.6066, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 740 }, { "epoch": 0.6555213374395072, "grad_norm": 0.19202682375907898, "learning_rate": 6.394328723346434e-05, "loss": 0.4578, "step": 745 }, { "epoch": 0.6555213374395072, "eval_loss": 0.5124692916870117, "eval_runtime": 28.6064, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 745 }, { "epoch": 0.6599208095028597, "grad_norm": 0.198526531457901, "learning_rate": 6.251400755846372e-05, "loss": 0.5176, "step": 750 }, { "epoch": 0.6599208095028597, "eval_loss": 0.5121349096298218, "eval_runtime": 28.5313, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 750 }, { "epoch": 0.664320281566212, "grad_norm": 0.19058994948863983, "learning_rate": 6.109358308451885e-05, "loss": 0.4877, "step": 755 }, { "epoch": 0.664320281566212, "eval_loss": 0.5118634700775146, "eval_runtime": 28.5287, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 755 }, { "epoch": 0.6687197536295645, "grad_norm": 0.1798192411661148, "learning_rate": 5.968234935414807e-05, "loss": 0.4805, "step": 760 }, { "epoch": 0.6687197536295645, "eval_loss": 0.5116167664527893, "eval_runtime": 28.5918, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 760 }, { "epoch": 0.6731192256929168, "grad_norm": 0.18448549509048462, "learning_rate": 5.828063973876834e-05, "loss": 0.4993, "step": 765 }, { "epoch": 0.6731192256929168, "eval_loss": 0.5111361742019653, "eval_runtime": 28.5586, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 765 }, { "epoch": 0.6775186977562693, "grad_norm": 0.18624383211135864, "learning_rate": 5.688878535994421e-05, "loss": 0.4844, "step": 770 }, { "epoch": 0.6775186977562693, "eval_loss": 0.5107051134109497, "eval_runtime": 28.5748, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 770 }, { "epoch": 0.6819181698196216, "grad_norm": 0.18364666402339935, "learning_rate": 5.550711501116789e-05, "loss": 0.4674, "step": 775 }, { "epoch": 0.6819181698196216, "eval_loss": 0.5101103186607361, "eval_runtime": 28.5159, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 775 }, { "epoch": 0.6863176418829741, "grad_norm": 0.23952247202396393, "learning_rate": 5.413595508018952e-05, "loss": 0.4943, "step": 780 }, { "epoch": 0.6863176418829741, "eval_loss": 0.5096238255500793, "eval_runtime": 28.516, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 780 }, { "epoch": 0.6907171139463264, "grad_norm": 0.20105206966400146, "learning_rate": 5.27756294719153e-05, "loss": 0.4924, "step": 785 }, { "epoch": 0.6907171139463264, "eval_loss": 0.5093135237693787, "eval_runtime": 28.5941, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 785 }, { "epoch": 0.6951165860096788, "grad_norm": 0.19826586544513702, "learning_rate": 5.1426459531892714e-05, "loss": 0.4986, "step": 790 }, { "epoch": 0.6951165860096788, "eval_loss": 0.5086015462875366, "eval_runtime": 28.6207, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 790 }, { "epoch": 0.6995160580730312, "grad_norm": 0.17991924285888672, "learning_rate": 5.008876397039983e-05, "loss": 0.4698, "step": 795 }, { "epoch": 0.6995160580730312, "eval_loss": 0.5082879662513733, "eval_runtime": 28.6587, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 795 }, { "epoch": 0.7039155301363836, "grad_norm": 0.19232523441314697, "learning_rate": 4.876285878715764e-05, "loss": 0.4981, "step": 800 }, { "epoch": 0.7039155301363836, "eval_loss": 0.5078893899917603, "eval_runtime": 28.5038, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 800 }, { "epoch": 0.708315002199736, "grad_norm": 0.19006720185279846, "learning_rate": 4.744905719668207e-05, "loss": 0.4758, "step": 805 }, { "epoch": 0.708315002199736, "eval_loss": 0.5076141357421875, "eval_runtime": 28.6324, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 805 }, { "epoch": 0.7127144742630884, "grad_norm": 0.19002890586853027, "learning_rate": 4.614766955429447e-05, "loss": 0.4642, "step": 810 }, { "epoch": 0.7127144742630884, "eval_loss": 0.507789671421051, "eval_runtime": 28.6356, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 810 }, { "epoch": 0.7171139463264409, "grad_norm": 0.2051495909690857, "learning_rate": 4.485900328280731e-05, "loss": 0.4669, "step": 815 }, { "epoch": 0.7171139463264409, "eval_loss": 0.5073484182357788, "eval_runtime": 28.5748, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 815 }, { "epoch": 0.7215134183897932, "grad_norm": 0.6378114223480225, "learning_rate": 4.358336279990268e-05, "loss": 0.4711, "step": 820 }, { "epoch": 0.7215134183897932, "eval_loss": 0.5070581436157227, "eval_runtime": 28.6233, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 820 }, { "epoch": 0.7259128904531457, "grad_norm": 0.181978240609169, "learning_rate": 4.2321049446220505e-05, "loss": 0.4704, "step": 825 }, { "epoch": 0.7259128904531457, "eval_loss": 0.5068845748901367, "eval_runtime": 28.5225, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 825 }, { "epoch": 0.730312362516498, "grad_norm": 0.1777966171503067, "learning_rate": 4.107236141417382e-05, "loss": 0.4752, "step": 830 }, { "epoch": 0.730312362516498, "eval_loss": 0.5066249966621399, "eval_runtime": 28.5423, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 830 }, { "epoch": 0.7347118345798505, "grad_norm": 0.18686190247535706, "learning_rate": 3.9837593677507726e-05, "loss": 0.4621, "step": 835 }, { "epoch": 0.7347118345798505, "eval_loss": 0.5066962242126465, "eval_runtime": 28.428, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.317, "step": 835 }, { "epoch": 0.7391113066432028, "grad_norm": 0.18854567408561707, "learning_rate": 3.8617037921618705e-05, "loss": 0.4748, "step": 840 }, { "epoch": 0.7391113066432028, "eval_loss": 0.50632643699646, "eval_runtime": 28.5075, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 840 }, { "epoch": 0.7435107787065552, "grad_norm": 0.19204109907150269, "learning_rate": 3.741098247465049e-05, "loss": 0.4948, "step": 845 }, { "epoch": 0.7435107787065552, "eval_loss": 0.5060507655143738, "eval_runtime": 28.5753, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 845 }, { "epoch": 0.7479102507699076, "grad_norm": 0.19182614982128143, "learning_rate": 3.621971223938334e-05, "loss": 0.4832, "step": 850 }, { "epoch": 0.7479102507699076, "eval_loss": 0.5058286190032959, "eval_runtime": 28.5184, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 850 }, { "epoch": 0.75230972283326, "grad_norm": 0.18205444514751434, "learning_rate": 3.504350862593231e-05, "loss": 0.4642, "step": 855 }, { "epoch": 0.75230972283326, "eval_loss": 0.505698025226593, "eval_runtime": 28.6382, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 855 }, { "epoch": 0.7567091948966124, "grad_norm": 0.20196740329265594, "learning_rate": 3.388264948527052e-05, "loss": 0.4877, "step": 860 }, { "epoch": 0.7567091948966124, "eval_loss": 0.5052359700202942, "eval_runtime": 28.5347, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 860 }, { "epoch": 0.7611086669599648, "grad_norm": 0.18125030398368835, "learning_rate": 3.2737409043593405e-05, "loss": 0.4727, "step": 865 }, { "epoch": 0.7611086669599648, "eval_loss": 0.504954993724823, "eval_runtime": 28.5976, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 865 }, { "epoch": 0.7655081390233172, "grad_norm": 0.18927669525146484, "learning_rate": 3.160805783753897e-05, "loss": 0.4691, "step": 870 }, { "epoch": 0.7655081390233172, "eval_loss": 0.5047942399978638, "eval_runtime": 28.5051, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 870 }, { "epoch": 0.7699076110866696, "grad_norm": 0.18508534133434296, "learning_rate": 3.0494862650279822e-05, "loss": 0.5292, "step": 875 }, { "epoch": 0.7699076110866696, "eval_loss": 0.5046341419219971, "eval_runtime": 28.5445, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 875 }, { "epoch": 0.774307083150022, "grad_norm": 0.18230414390563965, "learning_rate": 2.939808644850184e-05, "loss": 0.4708, "step": 880 }, { "epoch": 0.774307083150022, "eval_loss": 0.5046290755271912, "eval_runtime": 28.6138, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 880 }, { "epoch": 0.7787065552133744, "grad_norm": 0.17352643609046936, "learning_rate": 2.8317988320284228e-05, "loss": 0.4863, "step": 885 }, { "epoch": 0.7787065552133744, "eval_loss": 0.5044691562652588, "eval_runtime": 28.6321, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 885 }, { "epoch": 0.7831060272767268, "grad_norm": 0.1845002919435501, "learning_rate": 2.7254823413896058e-05, "loss": 0.5006, "step": 890 }, { "epoch": 0.7831060272767268, "eval_loss": 0.5042091012001038, "eval_runtime": 28.6132, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 890 }, { "epoch": 0.7875054993400792, "grad_norm": 0.17883773148059845, "learning_rate": 2.6208842877523278e-05, "loss": 0.4887, "step": 895 }, { "epoch": 0.7875054993400792, "eval_loss": 0.5039156675338745, "eval_runtime": 28.5693, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 895 }, { "epoch": 0.7919049714034316, "grad_norm": 0.19202597439289093, "learning_rate": 2.518029379994089e-05, "loss": 0.4867, "step": 900 }, { "epoch": 0.7919049714034316, "eval_loss": 0.5037320852279663, "eval_runtime": 28.549, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 900 }, { "epoch": 0.796304443466784, "grad_norm": 0.18246056139469147, "learning_rate": 2.4169419152143768e-05, "loss": 0.4662, "step": 905 }, { "epoch": 0.796304443466784, "eval_loss": 0.5035374164581299, "eval_runtime": 28.6042, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 905 }, { "epoch": 0.8007039155301364, "grad_norm": 0.18989378213882446, "learning_rate": 2.317645772995042e-05, "loss": 0.4744, "step": 910 }, { "epoch": 0.8007039155301364, "eval_loss": 0.5033923387527466, "eval_runtime": 28.4795, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.316, "step": 910 }, { "epoch": 0.8051033875934888, "grad_norm": 0.19525018334388733, "learning_rate": 2.220164409759299e-05, "loss": 0.5159, "step": 915 }, { "epoch": 0.8051033875934888, "eval_loss": 0.503151535987854, "eval_runtime": 28.6198, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 915 }, { "epoch": 0.8095028596568412, "grad_norm": 0.18840977549552917, "learning_rate": 2.124520853230697e-05, "loss": 0.4848, "step": 920 }, { "epoch": 0.8095028596568412, "eval_loss": 0.5029481649398804, "eval_runtime": 28.614, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 920 }, { "epoch": 0.8139023317201936, "grad_norm": 0.18055056035518646, "learning_rate": 2.03073769699333e-05, "loss": 0.4648, "step": 925 }, { "epoch": 0.8139023317201936, "eval_loss": 0.5028063654899597, "eval_runtime": 28.5662, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 925 }, { "epoch": 0.818301803783546, "grad_norm": 0.18352611362934113, "learning_rate": 1.9388370951546432e-05, "loss": 0.4733, "step": 930 }, { "epoch": 0.818301803783546, "eval_loss": 0.5027296543121338, "eval_runtime": 28.5532, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 930 }, { "epoch": 0.8227012758468983, "grad_norm": 0.18161964416503906, "learning_rate": 1.848840757112019e-05, "loss": 0.4556, "step": 935 }, { "epoch": 0.8227012758468983, "eval_loss": 0.5025849342346191, "eval_runtime": 28.6672, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 935 }, { "epoch": 0.8271007479102508, "grad_norm": 0.19485127925872803, "learning_rate": 1.7607699424244585e-05, "loss": 0.4973, "step": 940 }, { "epoch": 0.8271007479102508, "eval_loss": 0.5023777484893799, "eval_runtime": 28.5856, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 940 }, { "epoch": 0.8315002199736031, "grad_norm": 0.19218072295188904, "learning_rate": 1.674645455790468e-05, "loss": 0.4708, "step": 945 }, { "epoch": 0.8315002199736031, "eval_loss": 0.5024308562278748, "eval_runtime": 28.6001, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 945 }, { "epoch": 0.8358996920369556, "grad_norm": 0.18270643055438995, "learning_rate": 1.5904876421334536e-05, "loss": 0.4547, "step": 950 }, { "epoch": 0.8358996920369556, "eval_loss": 0.5024178624153137, "eval_runtime": 28.5464, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 950 }, { "epoch": 0.8402991641003079, "grad_norm": 0.18350371718406677, "learning_rate": 1.5083163817956914e-05, "loss": 0.4663, "step": 955 }, { "epoch": 0.8402991641003079, "eval_loss": 0.5021481513977051, "eval_runtime": 28.5783, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 955 }, { "epoch": 0.8446986361636604, "grad_norm": 0.18115630745887756, "learning_rate": 1.4281510858420632e-05, "loss": 0.4857, "step": 960 }, { "epoch": 0.8446986361636604, "eval_loss": 0.5019457340240479, "eval_runtime": 28.5976, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 960 }, { "epoch": 0.8490981082270127, "grad_norm": 0.1744571477174759, "learning_rate": 1.350010691474629e-05, "loss": 0.4633, "step": 965 }, { "epoch": 0.8490981082270127, "eval_loss": 0.5019629597663879, "eval_runtime": 28.5207, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 965 }, { "epoch": 0.8534975802903652, "grad_norm": 0.18827442824840546, "learning_rate": 1.2739136575591581e-05, "loss": 0.4723, "step": 970 }, { "epoch": 0.8534975802903652, "eval_loss": 0.5018792748451233, "eval_runtime": 28.4515, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.316, "step": 970 }, { "epoch": 0.8578970523537176, "grad_norm": 0.18166576325893402, "learning_rate": 1.1998779602646437e-05, "loss": 0.4691, "step": 975 }, { "epoch": 0.8578970523537176, "eval_loss": 0.5017500519752502, "eval_runtime": 28.5978, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 975 }, { "epoch": 0.8622965244170699, "grad_norm": 0.18091408908367157, "learning_rate": 1.1279210888168546e-05, "loss": 0.4874, "step": 980 }, { "epoch": 0.8622965244170699, "eval_loss": 0.5017052888870239, "eval_runtime": 28.7541, "eval_samples_per_second": 0.591, "eval_steps_per_second": 0.313, "step": 980 }, { "epoch": 0.8666959964804224, "grad_norm": 0.182442307472229, "learning_rate": 1.0580600413668984e-05, "loss": 0.4773, "step": 985 }, { "epoch": 0.8666959964804224, "eval_loss": 0.5016083121299744, "eval_runtime": 28.5972, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 985 }, { "epoch": 0.8710954685437747, "grad_norm": 0.18171900510787964, "learning_rate": 9.903113209758096e-06, "loss": 0.4806, "step": 990 }, { "epoch": 0.8710954685437747, "eval_loss": 0.5015130043029785, "eval_runtime": 28.5707, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 990 }, { "epoch": 0.8754949406071272, "grad_norm": 0.1896371841430664, "learning_rate": 9.246909317160746e-06, "loss": 0.4512, "step": 995 }, { "epoch": 0.8754949406071272, "eval_loss": 0.5013110637664795, "eval_runtime": 28.6509, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.314, "step": 995 }, { "epoch": 0.8798944126704795, "grad_norm": 0.1779976189136505, "learning_rate": 8.612143748910451e-06, "loss": 0.4561, "step": 1000 }, { "epoch": 0.8798944126704795, "eval_loss": 0.5013135075569153, "eval_runtime": 28.8047, "eval_samples_per_second": 0.59, "eval_steps_per_second": 0.312, "step": 1000 }, { "epoch": 0.884293884733832, "grad_norm": 0.17416957020759583, "learning_rate": 7.998966453731094e-06, "loss": 0.4637, "step": 1005 }, { "epoch": 0.884293884733832, "eval_loss": 0.5013565421104431, "eval_runtime": 28.5911, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1005 }, { "epoch": 0.8886933567971843, "grad_norm": 0.1769402176141739, "learning_rate": 7.40752228061502e-06, "loss": 0.4527, "step": 1010 }, { "epoch": 0.8886933567971843, "eval_loss": 0.5010828375816345, "eval_runtime": 28.5203, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 1010 }, { "epoch": 0.8930928288605368, "grad_norm": 0.17784808576107025, "learning_rate": 6.8379509446057644e-06, "loss": 0.4903, "step": 1015 }, { "epoch": 0.8930928288605368, "eval_loss": 0.5012202262878418, "eval_runtime": 27.8441, "eval_samples_per_second": 0.611, "eval_steps_per_second": 0.323, "step": 1015 }, { "epoch": 0.8974923009238891, "grad_norm": 0.18067394196987152, "learning_rate": 6.290386993793618e-06, "loss": 0.4689, "step": 1020 }, { "epoch": 0.8974923009238891, "eval_loss": 0.5012267231941223, "eval_runtime": 28.517, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 1020 }, { "epoch": 0.9018917729872415, "grad_norm": 0.17478391528129578, "learning_rate": 5.764959777531776e-06, "loss": 0.4589, "step": 1025 }, { "epoch": 0.9018917729872415, "eval_loss": 0.5011836290359497, "eval_runtime": 28.6023, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1025 }, { "epoch": 0.9062912450505939, "grad_norm": 0.185857892036438, "learning_rate": 5.261793415880456e-06, "loss": 0.4528, "step": 1030 }, { "epoch": 0.9062912450505939, "eval_loss": 0.501183807849884, "eval_runtime": 28.5159, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.316, "step": 1030 }, { "epoch": 0.9106907171139463, "grad_norm": 0.17951223254203796, "learning_rate": 4.781006770286478e-06, "loss": 0.4845, "step": 1035 }, { "epoch": 0.9106907171139463, "eval_loss": 0.5011433959007263, "eval_runtime": 28.6072, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1035 }, { "epoch": 0.9150901891772987, "grad_norm": 0.18096089363098145, "learning_rate": 4.322713415504975e-06, "loss": 0.4578, "step": 1040 }, { "epoch": 0.9150901891772987, "eval_loss": 0.5011703968048096, "eval_runtime": 28.6287, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 1040 }, { "epoch": 0.9194896612406511, "grad_norm": 0.2069099247455597, "learning_rate": 3.887021612769936e-06, "loss": 0.5027, "step": 1045 }, { "epoch": 0.9194896612406511, "eval_loss": 0.5011240839958191, "eval_runtime": 29.0514, "eval_samples_per_second": 0.585, "eval_steps_per_second": 0.31, "step": 1045 }, { "epoch": 0.9238891333040036, "grad_norm": 0.18762987852096558, "learning_rate": 3.4740342842199956e-06, "loss": 0.4695, "step": 1050 }, { "epoch": 0.9238891333040036, "eval_loss": 0.5010772347450256, "eval_runtime": 28.5655, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1050 }, { "epoch": 0.9282886053673559, "grad_norm": 0.178373321890831, "learning_rate": 3.0838489885854805e-06, "loss": 0.484, "step": 1055 }, { "epoch": 0.9282886053673559, "eval_loss": 0.5010451674461365, "eval_runtime": 28.6083, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1055 }, { "epoch": 0.9326880774307084, "grad_norm": 0.1794215440750122, "learning_rate": 2.7165578981424357e-06, "loss": 0.4784, "step": 1060 }, { "epoch": 0.9326880774307084, "eval_loss": 0.5010905265808105, "eval_runtime": 28.5675, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1060 }, { "epoch": 0.9370875494940607, "grad_norm": 0.17699354887008667, "learning_rate": 2.3722477769389517e-06, "loss": 0.4698, "step": 1065 }, { "epoch": 0.9370875494940607, "eval_loss": 0.5010352730751038, "eval_runtime": 28.6041, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1065 }, { "epoch": 0.9414870215574132, "grad_norm": 0.17208220064640045, "learning_rate": 2.0509999602992493e-06, "loss": 0.4517, "step": 1070 }, { "epoch": 0.9414870215574132, "eval_loss": 0.5010344982147217, "eval_runtime": 28.5865, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1070 }, { "epoch": 0.9458864936207655, "grad_norm": 0.1774464249610901, "learning_rate": 1.7528903356100469e-06, "loss": 0.4846, "step": 1075 }, { "epoch": 0.9458864936207655, "eval_loss": 0.5010223388671875, "eval_runtime": 28.5634, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1075 }, { "epoch": 0.9502859656841179, "grad_norm": 0.1773741990327835, "learning_rate": 1.4779893243939359e-06, "loss": 0.4402, "step": 1080 }, { "epoch": 0.9502859656841179, "eval_loss": 0.5009992718696594, "eval_runtime": 28.5952, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1080 }, { "epoch": 0.9546854377474703, "grad_norm": 0.18979211151599884, "learning_rate": 1.2263618656739084e-06, "loss": 0.5013, "step": 1085 }, { "epoch": 0.9546854377474703, "eval_loss": 0.501004159450531, "eval_runtime": 28.614, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1085 }, { "epoch": 0.9590849098108227, "grad_norm": 0.1895236372947693, "learning_rate": 9.98067400632985e-07, "loss": 0.4588, "step": 1090 }, { "epoch": 0.9590849098108227, "eval_loss": 0.5009981393814087, "eval_runtime": 28.5601, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1090 }, { "epoch": 0.9634843818741751, "grad_norm": 0.17328618466854095, "learning_rate": 7.931598585726563e-07, "loss": 0.4712, "step": 1095 }, { "epoch": 0.9634843818741751, "eval_loss": 0.500961184501648, "eval_runtime": 28.574, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1095 }, { "epoch": 0.9678838539375275, "grad_norm": 0.18122579157352448, "learning_rate": 6.116876441733088e-07, "loss": 0.4534, "step": 1100 }, { "epoch": 0.9678838539375275, "eval_loss": 0.5009814500808716, "eval_runtime": 28.5934, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1100 }, { "epoch": 0.9722833260008799, "grad_norm": 0.18148748576641083, "learning_rate": 4.536936260597258e-07, "loss": 0.4587, "step": 1105 }, { "epoch": 0.9722833260008799, "eval_loss": 0.5009997487068176, "eval_runtime": 28.5275, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 1105 }, { "epoch": 0.9766827980642323, "grad_norm": 0.18024764955043793, "learning_rate": 3.192151266743548e-07, "loss": 0.4783, "step": 1110 }, { "epoch": 0.9766827980642323, "eval_loss": 0.5009670853614807, "eval_runtime": 28.5688, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.315, "step": 1110 }, { "epoch": 0.9810822701275846, "grad_norm": 0.18152055144309998, "learning_rate": 2.082839134607828e-07, "loss": 0.4623, "step": 1115 }, { "epoch": 0.9810822701275846, "eval_loss": 0.5009202361106873, "eval_runtime": 28.6066, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1115 }, { "epoch": 0.9854817421909371, "grad_norm": 0.17324087023735046, "learning_rate": 1.2092619135937177e-07, "loss": 0.439, "step": 1120 }, { "epoch": 0.9854817421909371, "eval_loss": 0.5010377168655396, "eval_runtime": 28.5308, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.315, "step": 1120 }, { "epoch": 0.9898812142542894, "grad_norm": 0.17685554921627045, "learning_rate": 5.716259661695533e-08, "loss": 0.4629, "step": 1125 }, { "epoch": 0.9898812142542894, "eval_loss": 0.5009082555770874, "eval_runtime": 28.6259, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 1125 }, { "epoch": 0.9942806863176419, "grad_norm": 0.17675389349460602, "learning_rate": 1.7008191912004646e-08, "loss": 0.4716, "step": 1130 }, { "epoch": 0.9942806863176419, "eval_loss": 0.5009535551071167, "eval_runtime": 28.626, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.314, "step": 1130 }, { "epoch": 0.9986801583809943, "grad_norm": 0.18398317694664001, "learning_rate": 4.724627964303175e-10, "loss": 0.4832, "step": 1135 }, { "epoch": 0.9986801583809943, "eval_loss": 0.5010104179382324, "eval_runtime": 28.6106, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.315, "step": 1135 }, { "epoch": 0.9995600527936648, "step": 1136, "total_flos": 7.211600370336793e+18, "train_loss": 0.039691918463984004, "train_runtime": 9596.3839, "train_samples_per_second": 1.895, "train_steps_per_second": 0.118 } ], "logging_steps": 5, "max_steps": 1136, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.211600370336793e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }