diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5675 @@ +{ + "best_metric": 0.035647086799144745, + "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold4/checkpoint-1300", + "epoch": 9.999045892567503, + "eval_steps": 50, + "global_step": 6550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015265718919950386, + "grad_norm": 4.185297966003418, + "learning_rate": 1.5267175572519084e-06, + "loss": 1.6301, + "step": 10 + }, + { + "epoch": 0.030531437839900772, + "grad_norm": 4.850170612335205, + "learning_rate": 3.053435114503817e-06, + "loss": 1.5804, + "step": 20 + }, + { + "epoch": 0.04579715675985116, + "grad_norm": 4.640727519989014, + "learning_rate": 4.580152671755725e-06, + "loss": 1.5339, + "step": 30 + }, + { + "epoch": 0.061062875679801544, + "grad_norm": 6.06700325012207, + "learning_rate": 6.106870229007634e-06, + "loss": 1.451, + "step": 40 + }, + { + "epoch": 0.07632859459975193, + "grad_norm": 2.3383665084838867, + "learning_rate": 7.633587786259543e-06, + "loss": 0.916, + "step": 50 + }, + { + "epoch": 0.07632859459975193, + "eval_loss": 0.6881796717643738, + "eval_runtime": 159.2043, + "eval_samples_per_second": 7.318, + "eval_steps_per_second": 7.318, + "step": 50 + }, + { + "epoch": 0.09159431351970232, + "grad_norm": 2.3201258182525635, + "learning_rate": 9.16030534351145e-06, + "loss": 0.6855, + "step": 60 + }, + { + "epoch": 0.10686003243965271, + "grad_norm": 1.6580810546875, + "learning_rate": 1.0687022900763359e-05, + "loss": 0.5476, + "step": 70 + }, + { + "epoch": 0.12212575135960309, + "grad_norm": 1.4332237243652344, + "learning_rate": 1.2213740458015267e-05, + "loss": 0.3756, + "step": 80 + }, + { + "epoch": 0.13739147027955348, + "grad_norm": 1.2752610445022583, + "learning_rate": 1.3740458015267178e-05, + "loss": 0.268, + "step": 90 + }, + { + "epoch": 0.15265718919950386, + "grad_norm": 0.9856141805648804, + "learning_rate": 1.5267175572519086e-05, + "loss": 0.2175, + "step": 100 + }, + { + "epoch": 0.15265718919950386, + "eval_loss": 0.13453072309494019, + "eval_runtime": 159.257, + "eval_samples_per_second": 7.315, + "eval_steps_per_second": 7.315, + "step": 100 + }, + { + "epoch": 0.16792290811945426, + "grad_norm": 1.4137542247772217, + "learning_rate": 1.6793893129770993e-05, + "loss": 0.1411, + "step": 110 + }, + { + "epoch": 0.18318862703940464, + "grad_norm": 0.8845463395118713, + "learning_rate": 1.83206106870229e-05, + "loss": 0.1143, + "step": 120 + }, + { + "epoch": 0.19845434595935502, + "grad_norm": 0.8023373484611511, + "learning_rate": 1.984732824427481e-05, + "loss": 0.117, + "step": 130 + }, + { + "epoch": 0.21372006487930542, + "grad_norm": 0.9943445920944214, + "learning_rate": 2.1374045801526718e-05, + "loss": 0.0949, + "step": 140 + }, + { + "epoch": 0.2289857837992558, + "grad_norm": 0.6204259395599365, + "learning_rate": 2.2900763358778628e-05, + "loss": 0.0687, + "step": 150 + }, + { + "epoch": 0.2289857837992558, + "eval_loss": 0.07771342992782593, + "eval_runtime": 159.3376, + "eval_samples_per_second": 7.312, + "eval_steps_per_second": 7.312, + "step": 150 + }, + { + "epoch": 0.24425150271920618, + "grad_norm": 0.70217365026474, + "learning_rate": 2.4427480916030535e-05, + "loss": 0.081, + "step": 160 + }, + { + "epoch": 0.2595172216391566, + "grad_norm": 1.117322325706482, + "learning_rate": 2.5954198473282442e-05, + "loss": 0.0877, + "step": 170 + }, + { + "epoch": 0.27478294055910696, + "grad_norm": 0.651593804359436, + "learning_rate": 2.7480916030534355e-05, + "loss": 0.0863, + "step": 180 + }, + { + "epoch": 0.29004865947905734, + "grad_norm": 0.8036761283874512, + "learning_rate": 2.900763358778626e-05, + "loss": 0.0739, + "step": 190 + }, + { + "epoch": 0.3053143783990077, + "grad_norm": 0.6328158378601074, + "learning_rate": 3.053435114503817e-05, + "loss": 0.0541, + "step": 200 + }, + { + "epoch": 0.3053143783990077, + "eval_loss": 0.05991174280643463, + "eval_runtime": 159.312, + "eval_samples_per_second": 7.313, + "eval_steps_per_second": 7.313, + "step": 200 + }, + { + "epoch": 0.3205800973189581, + "grad_norm": 1.0223695039749146, + "learning_rate": 3.2061068702290076e-05, + "loss": 0.0739, + "step": 210 + }, + { + "epoch": 0.3358458162389085, + "grad_norm": 0.7223497629165649, + "learning_rate": 3.358778625954199e-05, + "loss": 0.0588, + "step": 220 + }, + { + "epoch": 0.3511115351588589, + "grad_norm": 0.9775506854057312, + "learning_rate": 3.511450381679389e-05, + "loss": 0.0509, + "step": 230 + }, + { + "epoch": 0.3663772540788093, + "grad_norm": 0.7470011115074158, + "learning_rate": 3.66412213740458e-05, + "loss": 0.0659, + "step": 240 + }, + { + "epoch": 0.38164297299875966, + "grad_norm": 0.8122346997261047, + "learning_rate": 3.816793893129771e-05, + "loss": 0.0681, + "step": 250 + }, + { + "epoch": 0.38164297299875966, + "eval_loss": 0.05324295908212662, + "eval_runtime": 159.4295, + "eval_samples_per_second": 7.307, + "eval_steps_per_second": 7.307, + "step": 250 + }, + { + "epoch": 0.39690869191871003, + "grad_norm": 1.0022155046463013, + "learning_rate": 3.969465648854962e-05, + "loss": 0.0606, + "step": 260 + }, + { + "epoch": 0.4121744108386604, + "grad_norm": 0.7752587795257568, + "learning_rate": 4.122137404580153e-05, + "loss": 0.06, + "step": 270 + }, + { + "epoch": 0.42744012975861084, + "grad_norm": 0.5192140340805054, + "learning_rate": 4.2748091603053435e-05, + "loss": 0.0626, + "step": 280 + }, + { + "epoch": 0.4427058486785612, + "grad_norm": 0.7159129977226257, + "learning_rate": 4.4274809160305345e-05, + "loss": 0.0618, + "step": 290 + }, + { + "epoch": 0.4579715675985116, + "grad_norm": 0.796240508556366, + "learning_rate": 4.5801526717557256e-05, + "loss": 0.0534, + "step": 300 + }, + { + "epoch": 0.4579715675985116, + "eval_loss": 0.0524621345102787, + "eval_runtime": 159.4071, + "eval_samples_per_second": 7.308, + "eval_steps_per_second": 7.308, + "step": 300 + }, + { + "epoch": 0.473237286518462, + "grad_norm": 0.7834715247154236, + "learning_rate": 4.7328244274809166e-05, + "loss": 0.0671, + "step": 310 + }, + { + "epoch": 0.48850300543841235, + "grad_norm": 0.4522975981235504, + "learning_rate": 4.885496183206107e-05, + "loss": 0.0432, + "step": 320 + }, + { + "epoch": 0.5037687243583627, + "grad_norm": 0.8254855871200562, + "learning_rate": 5.038167938931297e-05, + "loss": 0.0532, + "step": 330 + }, + { + "epoch": 0.5190344432783132, + "grad_norm": 1.0818955898284912, + "learning_rate": 5.1908396946564884e-05, + "loss": 0.0462, + "step": 340 + }, + { + "epoch": 0.5343001621982635, + "grad_norm": 0.8718852400779724, + "learning_rate": 5.3435114503816794e-05, + "loss": 0.0503, + "step": 350 + }, + { + "epoch": 0.5343001621982635, + "eval_loss": 0.056452371180057526, + "eval_runtime": 159.4977, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 350 + }, + { + "epoch": 0.5495658811182139, + "grad_norm": 0.5943936109542847, + "learning_rate": 5.496183206106871e-05, + "loss": 0.0475, + "step": 360 + }, + { + "epoch": 0.5648316000381643, + "grad_norm": 0.3838348090648651, + "learning_rate": 5.648854961832062e-05, + "loss": 0.06, + "step": 370 + }, + { + "epoch": 0.5800973189581147, + "grad_norm": 1.5746500492095947, + "learning_rate": 5.801526717557252e-05, + "loss": 0.0647, + "step": 380 + }, + { + "epoch": 0.5953630378780651, + "grad_norm": 0.5839850306510925, + "learning_rate": 5.954198473282443e-05, + "loss": 0.0592, + "step": 390 + }, + { + "epoch": 0.6106287567980154, + "grad_norm": 0.8526315093040466, + "learning_rate": 6.106870229007635e-05, + "loss": 0.0505, + "step": 400 + }, + { + "epoch": 0.6106287567980154, + "eval_loss": 0.0519135408103466, + "eval_runtime": 159.458, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 400 + }, + { + "epoch": 0.6258944757179659, + "grad_norm": 0.852070152759552, + "learning_rate": 6.259541984732826e-05, + "loss": 0.0548, + "step": 410 + }, + { + "epoch": 0.6411601946379162, + "grad_norm": 0.5597646236419678, + "learning_rate": 6.412213740458015e-05, + "loss": 0.056, + "step": 420 + }, + { + "epoch": 0.6564259135578666, + "grad_norm": 0.7913482189178467, + "learning_rate": 6.564885496183206e-05, + "loss": 0.0418, + "step": 430 + }, + { + "epoch": 0.671691632477817, + "grad_norm": 0.364639014005661, + "learning_rate": 6.717557251908397e-05, + "loss": 0.0542, + "step": 440 + }, + { + "epoch": 0.6869573513977674, + "grad_norm": 0.4461608827114105, + "learning_rate": 6.870229007633588e-05, + "loss": 0.0641, + "step": 450 + }, + { + "epoch": 0.6869573513977674, + "eval_loss": 0.05470795929431915, + "eval_runtime": 159.4655, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 450 + }, + { + "epoch": 0.7022230703177178, + "grad_norm": 0.4279601573944092, + "learning_rate": 7.022900763358778e-05, + "loss": 0.048, + "step": 460 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.3910561501979828, + "learning_rate": 7.175572519083969e-05, + "loss": 0.0424, + "step": 470 + }, + { + "epoch": 0.7327545081576186, + "grad_norm": 1.3399211168289185, + "learning_rate": 7.32824427480916e-05, + "loss": 0.051, + "step": 480 + }, + { + "epoch": 0.748020227077569, + "grad_norm": 0.4181521534919739, + "learning_rate": 7.480916030534351e-05, + "loss": 0.04, + "step": 490 + }, + { + "epoch": 0.7632859459975193, + "grad_norm": 0.23952007293701172, + "learning_rate": 7.633587786259542e-05, + "loss": 0.038, + "step": 500 + }, + { + "epoch": 0.7632859459975193, + "eval_loss": 0.04136418551206589, + "eval_runtime": 159.4948, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 500 + }, + { + "epoch": 0.7785516649174697, + "grad_norm": 0.49341875314712524, + "learning_rate": 7.786259541984733e-05, + "loss": 0.05, + "step": 510 + }, + { + "epoch": 0.7938173838374201, + "grad_norm": 0.5933352112770081, + "learning_rate": 7.938931297709924e-05, + "loss": 0.0546, + "step": 520 + }, + { + "epoch": 0.8090831027573705, + "grad_norm": 0.6072858572006226, + "learning_rate": 8.091603053435115e-05, + "loss": 0.0401, + "step": 530 + }, + { + "epoch": 0.8243488216773208, + "grad_norm": 0.940795361995697, + "learning_rate": 8.244274809160306e-05, + "loss": 0.052, + "step": 540 + }, + { + "epoch": 0.8396145405972713, + "grad_norm": 0.6024621725082397, + "learning_rate": 8.396946564885496e-05, + "loss": 0.0491, + "step": 550 + }, + { + "epoch": 0.8396145405972713, + "eval_loss": 0.04395580291748047, + "eval_runtime": 159.46, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 550 + }, + { + "epoch": 0.8548802595172217, + "grad_norm": 0.350858211517334, + "learning_rate": 8.549618320610687e-05, + "loss": 0.0515, + "step": 560 + }, + { + "epoch": 0.870145978437172, + "grad_norm": 0.406339168548584, + "learning_rate": 8.702290076335878e-05, + "loss": 0.0458, + "step": 570 + }, + { + "epoch": 0.8854116973571224, + "grad_norm": 1.2356770038604736, + "learning_rate": 8.854961832061069e-05, + "loss": 0.0628, + "step": 580 + }, + { + "epoch": 0.9006774162770728, + "grad_norm": 0.5165905952453613, + "learning_rate": 9.007633587786259e-05, + "loss": 0.0549, + "step": 590 + }, + { + "epoch": 0.9159431351970232, + "grad_norm": 0.5175654292106628, + "learning_rate": 9.160305343511451e-05, + "loss": 0.048, + "step": 600 + }, + { + "epoch": 0.9159431351970232, + "eval_loss": 0.041592296212911606, + "eval_runtime": 159.4026, + "eval_samples_per_second": 7.309, + "eval_steps_per_second": 7.309, + "step": 600 + }, + { + "epoch": 0.9312088541169735, + "grad_norm": 0.5316981673240662, + "learning_rate": 9.312977099236642e-05, + "loss": 0.0654, + "step": 610 + }, + { + "epoch": 0.946474573036924, + "grad_norm": 0.4219525456428528, + "learning_rate": 9.465648854961833e-05, + "loss": 0.0527, + "step": 620 + }, + { + "epoch": 0.9617402919568744, + "grad_norm": 0.4892270863056183, + "learning_rate": 9.618320610687024e-05, + "loss": 0.0364, + "step": 630 + }, + { + "epoch": 0.9770060108768247, + "grad_norm": 0.4207621216773987, + "learning_rate": 9.770992366412214e-05, + "loss": 0.0533, + "step": 640 + }, + { + "epoch": 0.9922717297967751, + "grad_norm": 0.37451624870300293, + "learning_rate": 9.923664122137405e-05, + "loss": 0.0394, + "step": 650 + }, + { + "epoch": 0.9922717297967751, + "eval_loss": 0.04279087483882904, + "eval_runtime": 159.443, + "eval_samples_per_second": 7.307, + "eval_steps_per_second": 7.307, + "step": 650 + }, + { + "epoch": 1.0075374487167255, + "grad_norm": 0.34824955463409424, + "learning_rate": 9.999982249447028e-05, + "loss": 0.0346, + "step": 660 + }, + { + "epoch": 1.0228031676366758, + "grad_norm": 0.30995848774909973, + "learning_rate": 9.999840245779451e-05, + "loss": 0.0373, + "step": 670 + }, + { + "epoch": 1.0380688865566263, + "grad_norm": 0.6333316564559937, + "learning_rate": 9.999556242477317e-05, + "loss": 0.0442, + "step": 680 + }, + { + "epoch": 1.0533346054765766, + "grad_norm": 0.27216362953186035, + "learning_rate": 9.999130247606558e-05, + "loss": 0.0298, + "step": 690 + }, + { + "epoch": 1.068600324396527, + "grad_norm": 0.3640441596508026, + "learning_rate": 9.998562273265785e-05, + "loss": 0.04, + "step": 700 + }, + { + "epoch": 1.068600324396527, + "eval_loss": 0.03989888355135918, + "eval_runtime": 159.4012, + "eval_samples_per_second": 7.309, + "eval_steps_per_second": 7.309, + "step": 700 + }, + { + "epoch": 1.0838660433164775, + "grad_norm": 0.4131262004375458, + "learning_rate": 9.99785233558594e-05, + "loss": 0.0466, + "step": 710 + }, + { + "epoch": 1.0991317622364278, + "grad_norm": 0.24598121643066406, + "learning_rate": 9.99700045472985e-05, + "loss": 0.0296, + "step": 720 + }, + { + "epoch": 1.1143974811563782, + "grad_norm": 0.36616966128349304, + "learning_rate": 9.996006654891639e-05, + "loss": 0.0345, + "step": 730 + }, + { + "epoch": 1.1296632000763287, + "grad_norm": 0.34495922923088074, + "learning_rate": 9.994870964296052e-05, + "loss": 0.0416, + "step": 740 + }, + { + "epoch": 1.144928918996279, + "grad_norm": 0.35510513186454773, + "learning_rate": 9.99359341519765e-05, + "loss": 0.041, + "step": 750 + }, + { + "epoch": 1.144928918996279, + "eval_loss": 0.039214372634887695, + "eval_runtime": 159.358, + "eval_samples_per_second": 7.311, + "eval_steps_per_second": 7.311, + "step": 750 + }, + { + "epoch": 1.1601946379162293, + "grad_norm": 0.2958844006061554, + "learning_rate": 9.992174043879892e-05, + "loss": 0.033, + "step": 760 + }, + { + "epoch": 1.1754603568361797, + "grad_norm": 0.26520371437072754, + "learning_rate": 9.99061289065411e-05, + "loss": 0.0365, + "step": 770 + }, + { + "epoch": 1.1907260757561302, + "grad_norm": 0.2883516550064087, + "learning_rate": 9.988909999858356e-05, + "loss": 0.0387, + "step": 780 + }, + { + "epoch": 1.2059917946760805, + "grad_norm": 0.7589551210403442, + "learning_rate": 9.98706541985615e-05, + "loss": 0.0338, + "step": 790 + }, + { + "epoch": 1.2212575135960309, + "grad_norm": 0.33063894510269165, + "learning_rate": 9.985079203035103e-05, + "loss": 0.0349, + "step": 800 + }, + { + "epoch": 1.2212575135960309, + "eval_loss": 0.04556870833039284, + "eval_runtime": 159.3716, + "eval_samples_per_second": 7.31, + "eval_steps_per_second": 7.31, + "step": 800 + }, + { + "epoch": 1.2365232325159814, + "grad_norm": 0.35318559408187866, + "learning_rate": 9.98295140580543e-05, + "loss": 0.0417, + "step": 810 + }, + { + "epoch": 1.2517889514359317, + "grad_norm": 0.42117926478385925, + "learning_rate": 9.980682088598348e-05, + "loss": 0.0387, + "step": 820 + }, + { + "epoch": 1.267054670355882, + "grad_norm": 0.4907573163509369, + "learning_rate": 9.97827131586436e-05, + "loss": 0.0458, + "step": 830 + }, + { + "epoch": 1.2823203892758324, + "grad_norm": 0.25423258543014526, + "learning_rate": 9.975719156071422e-05, + "loss": 0.0398, + "step": 840 + }, + { + "epoch": 1.297586108195783, + "grad_norm": 0.2559823989868164, + "learning_rate": 9.973025681702999e-05, + "loss": 0.0351, + "step": 850 + }, + { + "epoch": 1.297586108195783, + "eval_loss": 0.04090137407183647, + "eval_runtime": 159.459, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 850 + }, + { + "epoch": 1.3128518271157332, + "grad_norm": 0.5977096557617188, + "learning_rate": 9.970190969256014e-05, + "loss": 0.0449, + "step": 860 + }, + { + "epoch": 1.3281175460356835, + "grad_norm": 0.7555682063102722, + "learning_rate": 9.967215099238665e-05, + "loss": 0.0471, + "step": 870 + }, + { + "epoch": 1.343383264955634, + "grad_norm": 0.8137542605400085, + "learning_rate": 9.964098156168142e-05, + "loss": 0.0377, + "step": 880 + }, + { + "epoch": 1.3586489838755844, + "grad_norm": 1.471318006515503, + "learning_rate": 9.960840228568232e-05, + "loss": 0.0403, + "step": 890 + }, + { + "epoch": 1.3739147027955347, + "grad_norm": 0.3770105242729187, + "learning_rate": 9.957441408966792e-05, + "loss": 0.0372, + "step": 900 + }, + { + "epoch": 1.3739147027955347, + "eval_loss": 0.043522100895643234, + "eval_runtime": 159.451, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 900 + }, + { + "epoch": 1.3891804217154853, + "grad_norm": 0.4724035859107971, + "learning_rate": 9.953901793893137e-05, + "loss": 0.0307, + "step": 910 + }, + { + "epoch": 1.4044461406354356, + "grad_norm": 0.3052575886249542, + "learning_rate": 9.950221483875288e-05, + "loss": 0.0359, + "step": 920 + }, + { + "epoch": 1.419711859555386, + "grad_norm": 0.2783440053462982, + "learning_rate": 9.946400583437122e-05, + "loss": 0.0274, + "step": 930 + }, + { + "epoch": 1.4349775784753362, + "grad_norm": 0.4454845190048218, + "learning_rate": 9.942439201095397e-05, + "loss": 0.0394, + "step": 940 + }, + { + "epoch": 1.4502432973952868, + "grad_norm": 0.2844729721546173, + "learning_rate": 9.938337449356678e-05, + "loss": 0.0349, + "step": 950 + }, + { + "epoch": 1.4502432973952868, + "eval_loss": 0.03776552900671959, + "eval_runtime": 159.3885, + "eval_samples_per_second": 7.309, + "eval_steps_per_second": 7.309, + "step": 950 + }, + { + "epoch": 1.4655090163152371, + "grad_norm": 0.26744791865348816, + "learning_rate": 9.934095444714136e-05, + "loss": 0.0363, + "step": 960 + }, + { + "epoch": 1.4807747352351874, + "grad_norm": 0.36011719703674316, + "learning_rate": 9.929713307644244e-05, + "loss": 0.0381, + "step": 970 + }, + { + "epoch": 1.4960404541551378, + "grad_norm": 0.23996131122112274, + "learning_rate": 9.925191162603347e-05, + "loss": 0.0296, + "step": 980 + }, + { + "epoch": 1.5113061730750883, + "grad_norm": 0.3133530020713806, + "learning_rate": 9.920529138024141e-05, + "loss": 0.0401, + "step": 990 + }, + { + "epoch": 1.5265718919950386, + "grad_norm": 0.173675537109375, + "learning_rate": 9.91572736631201e-05, + "loss": 0.0289, + "step": 1000 + }, + { + "epoch": 1.5265718919950386, + "eval_loss": 0.03918570280075073, + "eval_runtime": 159.4919, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 1000 + }, + { + "epoch": 1.541837610914989, + "grad_norm": 0.2571834623813629, + "learning_rate": 9.910785983841282e-05, + "loss": 0.0381, + "step": 1010 + }, + { + "epoch": 1.5571033298349395, + "grad_norm": 0.4605151414871216, + "learning_rate": 9.905705130951338e-05, + "loss": 0.0321, + "step": 1020 + }, + { + "epoch": 1.5723690487548898, + "grad_norm": 0.2051086127758026, + "learning_rate": 9.90048495194264e-05, + "loss": 0.0325, + "step": 1030 + }, + { + "epoch": 1.5876347676748401, + "grad_norm": 0.2816246449947357, + "learning_rate": 9.895125595072628e-05, + "loss": 0.0311, + "step": 1040 + }, + { + "epoch": 1.6029004865947907, + "grad_norm": 0.16479884088039398, + "learning_rate": 9.889627212551508e-05, + "loss": 0.0329, + "step": 1050 + }, + { + "epoch": 1.6029004865947907, + "eval_loss": 0.039624087512493134, + "eval_runtime": 159.4799, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 1050 + }, + { + "epoch": 1.618166205514741, + "grad_norm": 0.13082267343997955, + "learning_rate": 9.883989960537933e-05, + "loss": 0.0299, + "step": 1060 + }, + { + "epoch": 1.6334319244346913, + "grad_norm": 0.5756223201751709, + "learning_rate": 9.878213999134561e-05, + "loss": 0.0463, + "step": 1070 + }, + { + "epoch": 1.6486976433546419, + "grad_norm": 0.3245750069618225, + "learning_rate": 9.872299492383517e-05, + "loss": 0.0327, + "step": 1080 + }, + { + "epoch": 1.663963362274592, + "grad_norm": 0.2791925370693207, + "learning_rate": 9.866246608261724e-05, + "loss": 0.0331, + "step": 1090 + }, + { + "epoch": 1.6792290811945425, + "grad_norm": 0.4042149782180786, + "learning_rate": 9.860055518676146e-05, + "loss": 0.0547, + "step": 1100 + }, + { + "epoch": 1.6792290811945425, + "eval_loss": 0.040536098182201385, + "eval_runtime": 159.3847, + "eval_samples_per_second": 7.309, + "eval_steps_per_second": 7.309, + "step": 1100 + }, + { + "epoch": 1.694494800114493, + "grad_norm": 0.29718685150146484, + "learning_rate": 9.85372639945889e-05, + "loss": 0.0451, + "step": 1110 + }, + { + "epoch": 1.7097605190344431, + "grad_norm": 0.23242589831352234, + "learning_rate": 9.847259430362223e-05, + "loss": 0.0292, + "step": 1120 + }, + { + "epoch": 1.7250262379543937, + "grad_norm": 0.29318344593048096, + "learning_rate": 9.840654795053462e-05, + "loss": 0.0288, + "step": 1130 + }, + { + "epoch": 1.740291956874344, + "grad_norm": 0.3213809132575989, + "learning_rate": 9.833912681109761e-05, + "loss": 0.0432, + "step": 1140 + }, + { + "epoch": 1.7555576757942943, + "grad_norm": 0.45590150356292725, + "learning_rate": 9.827033280012783e-05, + "loss": 0.0385, + "step": 1150 + }, + { + "epoch": 1.7555576757942943, + "eval_loss": 0.040681391954422, + "eval_runtime": 159.4595, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 1150 + }, + { + "epoch": 1.7708233947142449, + "grad_norm": 0.39289769530296326, + "learning_rate": 9.820016787143256e-05, + "loss": 0.0451, + "step": 1160 + }, + { + "epoch": 1.7860891136341952, + "grad_norm": 0.24000108242034912, + "learning_rate": 9.812863401775432e-05, + "loss": 0.0339, + "step": 1170 + }, + { + "epoch": 1.8013548325541455, + "grad_norm": 0.19592148065567017, + "learning_rate": 9.805573327071427e-05, + "loss": 0.0322, + "step": 1180 + }, + { + "epoch": 1.816620551474096, + "grad_norm": 0.20962874591350555, + "learning_rate": 9.798146770075441e-05, + "loss": 0.0373, + "step": 1190 + }, + { + "epoch": 1.8318862703940464, + "grad_norm": 0.4064479470252991, + "learning_rate": 9.790583941707892e-05, + "loss": 0.054, + "step": 1200 + }, + { + "epoch": 1.8318862703940464, + "eval_loss": 0.03855321183800697, + "eval_runtime": 159.4871, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 1200 + }, + { + "epoch": 1.8471519893139967, + "grad_norm": 0.27456554770469666, + "learning_rate": 9.782885056759413e-05, + "loss": 0.0416, + "step": 1210 + }, + { + "epoch": 1.8624177082339473, + "grad_norm": 0.46328073740005493, + "learning_rate": 9.775050333884763e-05, + "loss": 0.0322, + "step": 1220 + }, + { + "epoch": 1.8776834271538976, + "grad_norm": 0.08565231412649155, + "learning_rate": 9.767079995596606e-05, + "loss": 0.0345, + "step": 1230 + }, + { + "epoch": 1.892949146073848, + "grad_norm": 0.3863065540790558, + "learning_rate": 9.7589742682592e-05, + "loss": 0.037, + "step": 1240 + }, + { + "epoch": 1.9082148649937984, + "grad_norm": 0.18653155863285065, + "learning_rate": 9.750733382081965e-05, + "loss": 0.0311, + "step": 1250 + }, + { + "epoch": 1.9082148649937984, + "eval_loss": 0.03697221353650093, + "eval_runtime": 159.4283, + "eval_samples_per_second": 7.307, + "eval_steps_per_second": 7.307, + "step": 1250 + }, + { + "epoch": 1.9234805839137485, + "grad_norm": 0.15082551538944244, + "learning_rate": 9.742357571112944e-05, + "loss": 0.0317, + "step": 1260 + }, + { + "epoch": 1.938746302833699, + "grad_norm": 0.1693073809146881, + "learning_rate": 9.733847073232155e-05, + "loss": 0.0356, + "step": 1270 + }, + { + "epoch": 1.9540120217536494, + "grad_norm": 0.6657117009162903, + "learning_rate": 9.725202130144841e-05, + "loss": 0.0433, + "step": 1280 + }, + { + "epoch": 1.9692777406735997, + "grad_norm": 0.25211548805236816, + "learning_rate": 9.716422987374598e-05, + "loss": 0.0318, + "step": 1290 + }, + { + "epoch": 1.9845434595935503, + "grad_norm": 0.3978109359741211, + "learning_rate": 9.707509894256405e-05, + "loss": 0.0381, + "step": 1300 + }, + { + "epoch": 1.9845434595935503, + "eval_loss": 0.035647086799144745, + "eval_runtime": 159.4759, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 1300 + }, + { + "epoch": 1.9998091785135006, + "grad_norm": 0.11071023344993591, + "learning_rate": 9.698463103929542e-05, + "loss": 0.0398, + "step": 1310 + }, + { + "epoch": 2.015074897433451, + "grad_norm": 0.20861339569091797, + "learning_rate": 9.689282873330405e-05, + "loss": 0.0183, + "step": 1320 + }, + { + "epoch": 2.0303406163534015, + "grad_norm": 0.4546363949775696, + "learning_rate": 9.6799694631852e-05, + "loss": 0.0274, + "step": 1330 + }, + { + "epoch": 2.0456063352733516, + "grad_norm": 0.31461238861083984, + "learning_rate": 9.670523138002547e-05, + "loss": 0.027, + "step": 1340 + }, + { + "epoch": 2.060872054193302, + "grad_norm": 0.21481330692768097, + "learning_rate": 9.660944166065962e-05, + "loss": 0.0222, + "step": 1350 + }, + { + "epoch": 2.060872054193302, + "eval_loss": 0.03670891374349594, + "eval_runtime": 159.3699, + "eval_samples_per_second": 7.31, + "eval_steps_per_second": 7.31, + "step": 1350 + }, + { + "epoch": 2.0761377731132526, + "grad_norm": 0.21774673461914062, + "learning_rate": 9.651232819426242e-05, + "loss": 0.0191, + "step": 1360 + }, + { + "epoch": 2.0914034920332027, + "grad_norm": 0.3336104452610016, + "learning_rate": 9.64138937389373e-05, + "loss": 0.0249, + "step": 1370 + }, + { + "epoch": 2.1066692109531533, + "grad_norm": 0.3336142599582672, + "learning_rate": 9.631414109030497e-05, + "loss": 0.0279, + "step": 1380 + }, + { + "epoch": 2.121934929873104, + "grad_norm": 0.17093627154827118, + "learning_rate": 9.621307308142384e-05, + "loss": 0.0247, + "step": 1390 + }, + { + "epoch": 2.137200648793054, + "grad_norm": 0.27105674147605896, + "learning_rate": 9.61106925827097e-05, + "loss": 0.0259, + "step": 1400 + }, + { + "epoch": 2.137200648793054, + "eval_loss": 0.03786560148000717, + "eval_runtime": 159.4771, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 1400 + }, + { + "epoch": 2.1524663677130045, + "grad_norm": 0.3765684962272644, + "learning_rate": 9.600700250185414e-05, + "loss": 0.0369, + "step": 1410 + }, + { + "epoch": 2.167732086632955, + "grad_norm": 0.20686203241348267, + "learning_rate": 9.590200578374198e-05, + "loss": 0.0267, + "step": 1420 + }, + { + "epoch": 2.182997805552905, + "grad_norm": 0.18278922140598297, + "learning_rate": 9.579570541036757e-05, + "loss": 0.0229, + "step": 1430 + }, + { + "epoch": 2.1982635244728557, + "grad_norm": 0.28518304228782654, + "learning_rate": 9.568810440075026e-05, + "loss": 0.0225, + "step": 1440 + }, + { + "epoch": 2.213529243392806, + "grad_norm": 0.39306774735450745, + "learning_rate": 9.557920581084847e-05, + "loss": 0.0332, + "step": 1450 + }, + { + "epoch": 2.213529243392806, + "eval_loss": 0.03821547329425812, + "eval_runtime": 159.6056, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 1450 + }, + { + "epoch": 2.2287949623127563, + "grad_norm": 0.2619123160839081, + "learning_rate": 9.546901273347301e-05, + "loss": 0.0186, + "step": 1460 + }, + { + "epoch": 2.244060681232707, + "grad_norm": 0.37569043040275574, + "learning_rate": 9.535752829819926e-05, + "loss": 0.0256, + "step": 1470 + }, + { + "epoch": 2.2593264001526574, + "grad_norm": 0.20631712675094604, + "learning_rate": 9.524475567127813e-05, + "loss": 0.0216, + "step": 1480 + }, + { + "epoch": 2.2745921190726075, + "grad_norm": 0.24733610451221466, + "learning_rate": 9.513069805554636e-05, + "loss": 0.024, + "step": 1490 + }, + { + "epoch": 2.289857837992558, + "grad_norm": 0.18369323015213013, + "learning_rate": 9.501535869033537e-05, + "loss": 0.0186, + "step": 1500 + }, + { + "epoch": 2.289857837992558, + "eval_loss": 0.039878398180007935, + "eval_runtime": 159.5552, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 1500 + }, + { + "epoch": 2.305123556912508, + "grad_norm": 0.2366923838853836, + "learning_rate": 9.48987408513794e-05, + "loss": 0.0298, + "step": 1510 + }, + { + "epoch": 2.3203892758324587, + "grad_norm": 0.6729198098182678, + "learning_rate": 9.478084785072234e-05, + "loss": 0.029, + "step": 1520 + }, + { + "epoch": 2.3356549947524092, + "grad_norm": 0.5485087633132935, + "learning_rate": 9.46616830366238e-05, + "loss": 0.0298, + "step": 1530 + }, + { + "epoch": 2.3509207136723593, + "grad_norm": 0.3160726726055145, + "learning_rate": 9.454124979346391e-05, + "loss": 0.0223, + "step": 1540 + }, + { + "epoch": 2.36618643259231, + "grad_norm": 0.20146960020065308, + "learning_rate": 9.441955154164727e-05, + "loss": 0.0181, + "step": 1550 + }, + { + "epoch": 2.36618643259231, + "eval_loss": 0.041491299867630005, + "eval_runtime": 159.4346, + "eval_samples_per_second": 7.307, + "eval_steps_per_second": 7.307, + "step": 1550 + }, + { + "epoch": 2.3814521515122604, + "grad_norm": 0.29136916995048523, + "learning_rate": 9.429659173750576e-05, + "loss": 0.0306, + "step": 1560 + }, + { + "epoch": 2.3967178704322105, + "grad_norm": 0.4018327593803406, + "learning_rate": 9.417237387320039e-05, + "loss": 0.028, + "step": 1570 + }, + { + "epoch": 2.411983589352161, + "grad_norm": 0.5340149402618408, + "learning_rate": 9.404690147662218e-05, + "loss": 0.0342, + "step": 1580 + }, + { + "epoch": 2.4272493082721116, + "grad_norm": 0.3677927255630493, + "learning_rate": 9.392017811129188e-05, + "loss": 0.0291, + "step": 1590 + }, + { + "epoch": 2.4425150271920617, + "grad_norm": 0.2398582547903061, + "learning_rate": 9.379220737625877e-05, + "loss": 0.0253, + "step": 1600 + }, + { + "epoch": 2.4425150271920617, + "eval_loss": 0.03713800385594368, + "eval_runtime": 159.4675, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 1600 + }, + { + "epoch": 2.4577807461120122, + "grad_norm": 0.2909869849681854, + "learning_rate": 9.36629929059985e-05, + "loss": 0.0276, + "step": 1610 + }, + { + "epoch": 2.473046465031963, + "grad_norm": 0.1664232462644577, + "learning_rate": 9.353253837030985e-05, + "loss": 0.028, + "step": 1620 + }, + { + "epoch": 2.488312183951913, + "grad_norm": 0.43266087770462036, + "learning_rate": 9.340084747421047e-05, + "loss": 0.0356, + "step": 1630 + }, + { + "epoch": 2.5035779028718634, + "grad_norm": 0.1772838681936264, + "learning_rate": 9.326792395783169e-05, + "loss": 0.0343, + "step": 1640 + }, + { + "epoch": 2.518843621791814, + "grad_norm": 0.12271358072757721, + "learning_rate": 9.313377159631225e-05, + "loss": 0.0267, + "step": 1650 + }, + { + "epoch": 2.518843621791814, + "eval_loss": 0.03684351220726967, + "eval_runtime": 159.5459, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 1650 + }, + { + "epoch": 2.534109340711764, + "grad_norm": 0.42832237482070923, + "learning_rate": 9.299839419969118e-05, + "loss": 0.0216, + "step": 1660 + }, + { + "epoch": 2.5493750596317146, + "grad_norm": 0.19247139990329742, + "learning_rate": 9.286179561279948e-05, + "loss": 0.0142, + "step": 1670 + }, + { + "epoch": 2.5646407785516647, + "grad_norm": 0.4302532970905304, + "learning_rate": 9.2723979715151e-05, + "loss": 0.0242, + "step": 1680 + }, + { + "epoch": 2.5799064974716153, + "grad_norm": 0.18346640467643738, + "learning_rate": 9.258495042083221e-05, + "loss": 0.0309, + "step": 1690 + }, + { + "epoch": 2.595172216391566, + "grad_norm": 0.22617080807685852, + "learning_rate": 9.244471167839109e-05, + "loss": 0.0275, + "step": 1700 + }, + { + "epoch": 2.595172216391566, + "eval_loss": 0.038051262497901917, + "eval_runtime": 159.6023, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 1700 + }, + { + "epoch": 2.610437935311516, + "grad_norm": 0.24917519092559814, + "learning_rate": 9.230326747072486e-05, + "loss": 0.0212, + "step": 1710 + }, + { + "epoch": 2.6257036542314665, + "grad_norm": 0.4135741591453552, + "learning_rate": 9.216062181496712e-05, + "loss": 0.0299, + "step": 1720 + }, + { + "epoch": 2.640969373151417, + "grad_norm": 0.12938198447227478, + "learning_rate": 9.201677876237344e-05, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 2.656235092071367, + "grad_norm": 0.36285555362701416, + "learning_rate": 9.187174239820658e-05, + "loss": 0.0349, + "step": 1740 + }, + { + "epoch": 2.6715008109913176, + "grad_norm": 0.22116300463676453, + "learning_rate": 9.172551684162025e-05, + "loss": 0.0242, + "step": 1750 + }, + { + "epoch": 2.6715008109913176, + "eval_loss": 0.037581272423267365, + "eval_runtime": 159.5531, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 1750 + }, + { + "epoch": 2.686766529911268, + "grad_norm": 0.33238115906715393, + "learning_rate": 9.157810624554228e-05, + "loss": 0.0302, + "step": 1760 + }, + { + "epoch": 2.7020322488312183, + "grad_norm": 0.20207710564136505, + "learning_rate": 9.14295147965566e-05, + "loss": 0.0334, + "step": 1770 + }, + { + "epoch": 2.717297967751169, + "grad_norm": 0.18509544432163239, + "learning_rate": 9.127974671478432e-05, + "loss": 0.0228, + "step": 1780 + }, + { + "epoch": 2.732563686671119, + "grad_norm": 0.2924479842185974, + "learning_rate": 9.112880625376392e-05, + "loss": 0.035, + "step": 1790 + }, + { + "epoch": 2.7478294055910695, + "grad_norm": 0.2171526998281479, + "learning_rate": 9.097669770033044e-05, + "loss": 0.0277, + "step": 1800 + }, + { + "epoch": 2.7478294055910695, + "eval_loss": 0.03815087676048279, + "eval_runtime": 159.5477, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 1800 + }, + { + "epoch": 2.76309512451102, + "grad_norm": 0.16223770380020142, + "learning_rate": 9.08234253744937e-05, + "loss": 0.0281, + "step": 1810 + }, + { + "epoch": 2.7783608434309706, + "grad_norm": 0.2031286656856537, + "learning_rate": 9.066899362931562e-05, + "loss": 0.0243, + "step": 1820 + }, + { + "epoch": 2.7936265623509207, + "grad_norm": 0.39663904905319214, + "learning_rate": 9.051340685078664e-05, + "loss": 0.0246, + "step": 1830 + }, + { + "epoch": 2.808892281270871, + "grad_norm": 0.23519934713840485, + "learning_rate": 9.035666945770107e-05, + "loss": 0.0281, + "step": 1840 + }, + { + "epoch": 2.8241580001908213, + "grad_norm": 0.4495214819908142, + "learning_rate": 9.019878590153166e-05, + "loss": 0.0315, + "step": 1850 + }, + { + "epoch": 2.8241580001908213, + "eval_loss": 0.03644520416855812, + "eval_runtime": 159.5095, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 1850 + }, + { + "epoch": 2.839423719110772, + "grad_norm": 0.36941808462142944, + "learning_rate": 9.003976066630311e-05, + "loss": 0.0326, + "step": 1860 + }, + { + "epoch": 2.8546894380307224, + "grad_norm": 0.24380412697792053, + "learning_rate": 8.98795982684648e-05, + "loss": 0.0225, + "step": 1870 + }, + { + "epoch": 2.8699551569506725, + "grad_norm": 0.30239325761795044, + "learning_rate": 8.971830325676245e-05, + "loss": 0.0215, + "step": 1880 + }, + { + "epoch": 2.885220875870623, + "grad_norm": 0.5324017405509949, + "learning_rate": 8.955588021210896e-05, + "loss": 0.0364, + "step": 1890 + }, + { + "epoch": 2.9004865947905736, + "grad_norm": 0.2610476315021515, + "learning_rate": 8.939233374745433e-05, + "loss": 0.0276, + "step": 1900 + }, + { + "epoch": 2.9004865947905736, + "eval_loss": 0.036984920501708984, + "eval_runtime": 159.5172, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 7.303, + "step": 1900 + }, + { + "epoch": 2.9157523137105237, + "grad_norm": 0.25521320104599, + "learning_rate": 8.92276685076546e-05, + "loss": 0.0315, + "step": 1910 + }, + { + "epoch": 2.9310180326304742, + "grad_norm": 0.12682771682739258, + "learning_rate": 8.906188916933998e-05, + "loss": 0.0275, + "step": 1920 + }, + { + "epoch": 2.9462837515504248, + "grad_norm": 0.36188557744026184, + "learning_rate": 8.889500044078198e-05, + "loss": 0.0353, + "step": 1930 + }, + { + "epoch": 2.961549470470375, + "grad_norm": 0.13340769708156586, + "learning_rate": 8.872700706175975e-05, + "loss": 0.0287, + "step": 1940 + }, + { + "epoch": 2.9768151893903254, + "grad_norm": 0.2620132863521576, + "learning_rate": 8.85579138034254e-05, + "loss": 0.0333, + "step": 1950 + }, + { + "epoch": 2.9768151893903254, + "eval_loss": 0.03623765707015991, + "eval_runtime": 159.5836, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 7.3, + "step": 1950 + }, + { + "epoch": 2.9920809083102755, + "grad_norm": 0.14295534789562225, + "learning_rate": 8.838772546816856e-05, + "loss": 0.0242, + "step": 1960 + }, + { + "epoch": 3.007346627230226, + "grad_norm": 0.09052982181310654, + "learning_rate": 8.821644688947993e-05, + "loss": 0.0154, + "step": 1970 + }, + { + "epoch": 3.0226123461501766, + "grad_norm": 0.3653980791568756, + "learning_rate": 8.804408293181407e-05, + "loss": 0.022, + "step": 1980 + }, + { + "epoch": 3.0378780650701267, + "grad_norm": 0.16551321744918823, + "learning_rate": 8.787063849045118e-05, + "loss": 0.019, + "step": 1990 + }, + { + "epoch": 3.0531437839900772, + "grad_norm": 0.1436963826417923, + "learning_rate": 8.76961184913581e-05, + "loss": 0.013, + "step": 2000 + }, + { + "epoch": 3.0531437839900772, + "eval_loss": 0.040799323469400406, + "eval_runtime": 159.6152, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 2000 + }, + { + "epoch": 3.068409502910028, + "grad_norm": 0.4144124984741211, + "learning_rate": 8.752052789104844e-05, + "loss": 0.0123, + "step": 2010 + }, + { + "epoch": 3.083675221829978, + "grad_norm": 0.45267727971076965, + "learning_rate": 8.734387167644171e-05, + "loss": 0.0145, + "step": 2020 + }, + { + "epoch": 3.0989409407499284, + "grad_norm": 0.2996237874031067, + "learning_rate": 8.716615486472183e-05, + "loss": 0.0149, + "step": 2030 + }, + { + "epoch": 3.114206659669879, + "grad_norm": 0.19144207239151, + "learning_rate": 8.698738250319451e-05, + "loss": 0.02, + "step": 2040 + }, + { + "epoch": 3.129472378589829, + "grad_norm": 0.3586185574531555, + "learning_rate": 8.680755966914401e-05, + "loss": 0.0146, + "step": 2050 + }, + { + "epoch": 3.129472378589829, + "eval_loss": 0.0397295206785202, + "eval_runtime": 159.5885, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 7.3, + "step": 2050 + }, + { + "epoch": 3.1447380975097796, + "grad_norm": 0.12905095517635345, + "learning_rate": 8.662669146968879e-05, + "loss": 0.013, + "step": 2060 + }, + { + "epoch": 3.16000381642973, + "grad_norm": 0.25925204157829285, + "learning_rate": 8.644478304163666e-05, + "loss": 0.0149, + "step": 2070 + }, + { + "epoch": 3.1752695353496803, + "grad_norm": 0.5712788701057434, + "learning_rate": 8.626183955133875e-05, + "loss": 0.0168, + "step": 2080 + }, + { + "epoch": 3.190535254269631, + "grad_norm": 0.373526006937027, + "learning_rate": 8.60778661945428e-05, + "loss": 0.0144, + "step": 2090 + }, + { + "epoch": 3.2058009731895813, + "grad_norm": 0.15406440198421478, + "learning_rate": 8.589286819624569e-05, + "loss": 0.0168, + "step": 2100 + }, + { + "epoch": 3.2058009731895813, + "eval_loss": 0.04190153256058693, + "eval_runtime": 159.6132, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 2100 + }, + { + "epoch": 3.2210666921095314, + "grad_norm": 0.09211976826190948, + "learning_rate": 8.570685081054486e-05, + "loss": 0.0136, + "step": 2110 + }, + { + "epoch": 3.236332411029482, + "grad_norm": 0.30212169885635376, + "learning_rate": 8.551981932048931e-05, + "loss": 0.0153, + "step": 2120 + }, + { + "epoch": 3.251598129949432, + "grad_norm": 0.06508340686559677, + "learning_rate": 8.533177903792937e-05, + "loss": 0.0132, + "step": 2130 + }, + { + "epoch": 3.2668638488693826, + "grad_norm": 0.3660019636154175, + "learning_rate": 8.5142735303366e-05, + "loss": 0.0114, + "step": 2140 + }, + { + "epoch": 3.282129567789333, + "grad_norm": 0.7414308190345764, + "learning_rate": 8.495269348579895e-05, + "loss": 0.0202, + "step": 2150 + }, + { + "epoch": 3.282129567789333, + "eval_loss": 0.04507509246468544, + "eval_runtime": 159.4806, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 2150 + }, + { + "epoch": 3.2973952867092833, + "grad_norm": 0.36749276518821716, + "learning_rate": 8.47616589825744e-05, + "loss": 0.0193, + "step": 2160 + }, + { + "epoch": 3.312661005629234, + "grad_norm": 0.31926947832107544, + "learning_rate": 8.456963721923165e-05, + "loss": 0.0161, + "step": 2170 + }, + { + "epoch": 3.3279267245491844, + "grad_norm": 0.41529110074043274, + "learning_rate": 8.437663364934901e-05, + "loss": 0.0273, + "step": 2180 + }, + { + "epoch": 3.3431924434691345, + "grad_norm": 0.2972193956375122, + "learning_rate": 8.418265375438884e-05, + "loss": 0.0138, + "step": 2190 + }, + { + "epoch": 3.358458162389085, + "grad_norm": 0.252144455909729, + "learning_rate": 8.398770304354204e-05, + "loss": 0.0197, + "step": 2200 + }, + { + "epoch": 3.358458162389085, + "eval_loss": 0.039750855416059494, + "eval_runtime": 159.5116, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 2200 + }, + { + "epoch": 3.3737238813090356, + "grad_norm": 0.38356465101242065, + "learning_rate": 8.379178705357143e-05, + "loss": 0.021, + "step": 2210 + }, + { + "epoch": 3.3889896002289857, + "grad_norm": 0.24160213768482208, + "learning_rate": 8.359491134865461e-05, + "loss": 0.0148, + "step": 2220 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.2044498771429062, + "learning_rate": 8.339708152022585e-05, + "loss": 0.0177, + "step": 2230 + }, + { + "epoch": 3.4195210380688867, + "grad_norm": 0.23636089265346527, + "learning_rate": 8.319830318681734e-05, + "loss": 0.0155, + "step": 2240 + }, + { + "epoch": 3.434786756988837, + "grad_norm": 0.3427311182022095, + "learning_rate": 8.29985819938996e-05, + "loss": 0.0167, + "step": 2250 + }, + { + "epoch": 3.434786756988837, + "eval_loss": 0.04210291802883148, + "eval_runtime": 159.4651, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 2250 + }, + { + "epoch": 3.4500524759087874, + "grad_norm": 0.3438746929168701, + "learning_rate": 8.279792361372114e-05, + "loss": 0.0134, + "step": 2260 + }, + { + "epoch": 3.465318194828738, + "grad_norm": 0.4624626040458679, + "learning_rate": 8.259633374514736e-05, + "loss": 0.0126, + "step": 2270 + }, + { + "epoch": 3.480583913748688, + "grad_norm": 0.36924147605895996, + "learning_rate": 8.239381811349874e-05, + "loss": 0.0186, + "step": 2280 + }, + { + "epoch": 3.4958496326686386, + "grad_norm": 0.3343561887741089, + "learning_rate": 8.219038247038819e-05, + "loss": 0.0177, + "step": 2290 + }, + { + "epoch": 3.5111153515885887, + "grad_norm": 0.4497634470462799, + "learning_rate": 8.198603259355768e-05, + "loss": 0.0224, + "step": 2300 + }, + { + "epoch": 3.5111153515885887, + "eval_loss": 0.04125366359949112, + "eval_runtime": 159.4766, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 2300 + }, + { + "epoch": 3.526381070508539, + "grad_norm": 0.4223650395870209, + "learning_rate": 8.178077428671425e-05, + "loss": 0.0219, + "step": 2310 + }, + { + "epoch": 3.5416467894284898, + "grad_norm": 0.4060512185096741, + "learning_rate": 8.157461337936506e-05, + "loss": 0.0135, + "step": 2320 + }, + { + "epoch": 3.5569125083484403, + "grad_norm": 0.2580900490283966, + "learning_rate": 8.136755572665187e-05, + "loss": 0.0174, + "step": 2330 + }, + { + "epoch": 3.5721782272683904, + "grad_norm": 0.2188238650560379, + "learning_rate": 8.115960720918476e-05, + "loss": 0.0133, + "step": 2340 + }, + { + "epoch": 3.587443946188341, + "grad_norm": 0.21322885155677795, + "learning_rate": 8.095077373287516e-05, + "loss": 0.0173, + "step": 2350 + }, + { + "epoch": 3.587443946188341, + "eval_loss": 0.04570980370044708, + "eval_runtime": 159.4973, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 2350 + }, + { + "epoch": 3.602709665108291, + "grad_norm": 0.3630387485027313, + "learning_rate": 8.074106122876803e-05, + "loss": 0.0214, + "step": 2360 + }, + { + "epoch": 3.6179753840282416, + "grad_norm": 0.21328367292881012, + "learning_rate": 8.053047565287346e-05, + "loss": 0.0223, + "step": 2370 + }, + { + "epoch": 3.633241102948192, + "grad_norm": 0.4291078448295593, + "learning_rate": 8.031902298599749e-05, + "loss": 0.0113, + "step": 2380 + }, + { + "epoch": 3.6485068218681422, + "grad_norm": 0.3949430584907532, + "learning_rate": 8.010670923357228e-05, + "loss": 0.0107, + "step": 2390 + }, + { + "epoch": 3.663772540788093, + "grad_norm": 0.2193780243396759, + "learning_rate": 7.989354042548557e-05, + "loss": 0.0143, + "step": 2400 + }, + { + "epoch": 3.663772540788093, + "eval_loss": 0.044236745685338974, + "eval_runtime": 159.4874, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 2400 + }, + { + "epoch": 3.679038259708043, + "grad_norm": 0.11936675012111664, + "learning_rate": 7.967952261590935e-05, + "loss": 0.0217, + "step": 2410 + }, + { + "epoch": 3.6943039786279934, + "grad_norm": 0.28958603739738464, + "learning_rate": 7.946466188312796e-05, + "loss": 0.0206, + "step": 2420 + }, + { + "epoch": 3.709569697547944, + "grad_norm": 0.3468035161495209, + "learning_rate": 7.92489643293655e-05, + "loss": 0.0154, + "step": 2430 + }, + { + "epoch": 3.7248354164678945, + "grad_norm": 0.3216131627559662, + "learning_rate": 7.903243608061246e-05, + "loss": 0.0125, + "step": 2440 + }, + { + "epoch": 3.7401011353878446, + "grad_norm": 0.4426937997341156, + "learning_rate": 7.881508328645175e-05, + "loss": 0.0179, + "step": 2450 + }, + { + "epoch": 3.7401011353878446, + "eval_loss": 0.042183805257081985, + "eval_runtime": 159.5215, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 7.303, + "step": 2450 + }, + { + "epoch": 3.755366854307795, + "grad_norm": 0.1870480328798294, + "learning_rate": 7.859691211988408e-05, + "loss": 0.0202, + "step": 2460 + }, + { + "epoch": 3.7706325732277453, + "grad_norm": 0.23078645765781403, + "learning_rate": 7.83779287771526e-05, + "loss": 0.016, + "step": 2470 + }, + { + "epoch": 3.785898292147696, + "grad_norm": 0.5037941932678223, + "learning_rate": 7.815813947756697e-05, + "loss": 0.0228, + "step": 2480 + }, + { + "epoch": 3.8011640110676463, + "grad_norm": 0.44231465458869934, + "learning_rate": 7.793755046332665e-05, + "loss": 0.0177, + "step": 2490 + }, + { + "epoch": 3.8164297299875964, + "grad_norm": 0.286805659532547, + "learning_rate": 7.771616799934371e-05, + "loss": 0.0178, + "step": 2500 + }, + { + "epoch": 3.8164297299875964, + "eval_loss": 0.04359288513660431, + "eval_runtime": 159.548, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 2500 + }, + { + "epoch": 3.831695448907547, + "grad_norm": 0.364823579788208, + "learning_rate": 7.749399837306484e-05, + "loss": 0.0151, + "step": 2510 + }, + { + "epoch": 3.8469611678274975, + "grad_norm": 0.6462925672531128, + "learning_rate": 7.727104789429278e-05, + "loss": 0.0211, + "step": 2520 + }, + { + "epoch": 3.8622268867474476, + "grad_norm": 0.7997065782546997, + "learning_rate": 7.704732289500716e-05, + "loss": 0.0204, + "step": 2530 + }, + { + "epoch": 3.877492605667398, + "grad_norm": 0.18169890344142914, + "learning_rate": 7.68228297291846e-05, + "loss": 0.014, + "step": 2540 + }, + { + "epoch": 3.8927583245873487, + "grad_norm": 0.25392216444015503, + "learning_rate": 7.659757477261833e-05, + "loss": 0.0136, + "step": 2550 + }, + { + "epoch": 3.8927583245873487, + "eval_loss": 0.04501135274767876, + "eval_runtime": 159.58, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 7.3, + "step": 2550 + }, + { + "epoch": 3.908024043507299, + "grad_norm": 0.13438932597637177, + "learning_rate": 7.637156442273705e-05, + "loss": 0.0186, + "step": 2560 + }, + { + "epoch": 3.9232897624272494, + "grad_norm": 0.6526336073875427, + "learning_rate": 7.614480509842325e-05, + "loss": 0.0166, + "step": 2570 + }, + { + "epoch": 3.9385554813471995, + "grad_norm": 0.12195435911417007, + "learning_rate": 7.591730323983099e-05, + "loss": 0.0194, + "step": 2580 + }, + { + "epoch": 3.95382120026715, + "grad_norm": 0.22584785521030426, + "learning_rate": 7.568906530820282e-05, + "loss": 0.0173, + "step": 2590 + }, + { + "epoch": 3.9690869191871005, + "grad_norm": 0.6468820571899414, + "learning_rate": 7.546009778568641e-05, + "loss": 0.0193, + "step": 2600 + }, + { + "epoch": 3.9690869191871005, + "eval_loss": 0.03968249261379242, + "eval_runtime": 159.5761, + "eval_samples_per_second": 7.301, + "eval_steps_per_second": 7.301, + "step": 2600 + }, + { + "epoch": 3.984352638107051, + "grad_norm": 0.1573544293642044, + "learning_rate": 7.523040717515049e-05, + "loss": 0.0328, + "step": 2610 + }, + { + "epoch": 3.999618357027001, + "grad_norm": 0.6769170165061951, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0213, + "step": 2620 + }, + { + "epoch": 4.014884075946951, + "grad_norm": 0.11733292788267136, + "learning_rate": 7.476888280399097e-05, + "loss": 0.0094, + "step": 2630 + }, + { + "epoch": 4.030149794866902, + "grad_norm": 0.039417609572410583, + "learning_rate": 7.453706215104461e-05, + "loss": 0.0066, + "step": 2640 + }, + { + "epoch": 4.045415513786852, + "grad_norm": 0.4035051465034485, + "learning_rate": 7.430454462506084e-05, + "loss": 0.009, + "step": 2650 + }, + { + "epoch": 4.045415513786852, + "eval_loss": 0.04601183161139488, + "eval_runtime": 159.6208, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 2650 + }, + { + "epoch": 4.060681232706803, + "grad_norm": 0.1853918582201004, + "learning_rate": 7.407133682973144e-05, + "loss": 0.008, + "step": 2660 + }, + { + "epoch": 4.0759469516267535, + "grad_norm": 0.1659402996301651, + "learning_rate": 7.383744538835236e-05, + "loss": 0.0087, + "step": 2670 + }, + { + "epoch": 4.091212670546703, + "grad_norm": 0.07168131321668625, + "learning_rate": 7.360287694363566e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 4.106478389466654, + "grad_norm": 0.1309129148721695, + "learning_rate": 7.33676381575209e-05, + "loss": 0.0079, + "step": 2690 + }, + { + "epoch": 4.121744108386604, + "grad_norm": 0.078034408390522, + "learning_rate": 7.313173571098585e-05, + "loss": 0.0107, + "step": 2700 + }, + { + "epoch": 4.121744108386604, + "eval_loss": 0.046977698802948, + "eval_runtime": 159.4946, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 2700 + }, + { + "epoch": 4.137009827306555, + "grad_norm": 0.35108262300491333, + "learning_rate": 7.289517630385688e-05, + "loss": 0.0112, + "step": 2710 + }, + { + "epoch": 4.152275546226505, + "grad_norm": 0.3373805582523346, + "learning_rate": 7.265796665461844e-05, + "loss": 0.0051, + "step": 2720 + }, + { + "epoch": 4.167541265146456, + "grad_norm": 0.1995253711938858, + "learning_rate": 7.242011350022254e-05, + "loss": 0.006, + "step": 2730 + }, + { + "epoch": 4.1828069840664055, + "grad_norm": 0.05416731536388397, + "learning_rate": 7.21816235958972e-05, + "loss": 0.01, + "step": 2740 + }, + { + "epoch": 4.198072702986356, + "grad_norm": 0.046558890491724014, + "learning_rate": 7.194250371495467e-05, + "loss": 0.0038, + "step": 2750 + }, + { + "epoch": 4.198072702986356, + "eval_loss": 0.048399168998003006, + "eval_runtime": 159.4875, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 2750 + }, + { + "epoch": 4.213338421906307, + "grad_norm": 0.28177809715270996, + "learning_rate": 7.170276064859909e-05, + "loss": 0.0102, + "step": 2760 + }, + { + "epoch": 4.228604140826257, + "grad_norm": 0.22862623631954193, + "learning_rate": 7.146240120573358e-05, + "loss": 0.0141, + "step": 2770 + }, + { + "epoch": 4.243869859746208, + "grad_norm": 0.1651599407196045, + "learning_rate": 7.122143221276685e-05, + "loss": 0.0052, + "step": 2780 + }, + { + "epoch": 4.259135578666158, + "grad_norm": 0.3874899744987488, + "learning_rate": 7.097986051341934e-05, + "loss": 0.0067, + "step": 2790 + }, + { + "epoch": 4.274401297586108, + "grad_norm": 0.03585471212863922, + "learning_rate": 7.073769296852888e-05, + "loss": 0.0078, + "step": 2800 + }, + { + "epoch": 4.274401297586108, + "eval_loss": 0.05307689681649208, + "eval_runtime": 159.5806, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 7.3, + "step": 2800 + }, + { + "epoch": 4.289667016506058, + "grad_norm": 0.15614160895347595, + "learning_rate": 7.049493645585579e-05, + "loss": 0.0081, + "step": 2810 + }, + { + "epoch": 4.304932735426009, + "grad_norm": 0.1412787139415741, + "learning_rate": 7.025159786988758e-05, + "loss": 0.0068, + "step": 2820 + }, + { + "epoch": 4.3201984543459595, + "grad_norm": 0.07446057349443436, + "learning_rate": 7.00076841216431e-05, + "loss": 0.0056, + "step": 2830 + }, + { + "epoch": 4.33546417326591, + "grad_norm": 0.20580802857875824, + "learning_rate": 6.976320213847633e-05, + "loss": 0.0086, + "step": 2840 + }, + { + "epoch": 4.350729892185861, + "grad_norm": 0.2521199882030487, + "learning_rate": 6.951815886387955e-05, + "loss": 0.0105, + "step": 2850 + }, + { + "epoch": 4.350729892185861, + "eval_loss": 0.05161132290959358, + "eval_runtime": 159.5458, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 2850 + }, + { + "epoch": 4.36599561110581, + "grad_norm": 0.21899621188640594, + "learning_rate": 6.927256125728624e-05, + "loss": 0.01, + "step": 2860 + }, + { + "epoch": 4.381261330025761, + "grad_norm": 0.04916749149560928, + "learning_rate": 6.902641629387331e-05, + "loss": 0.0046, + "step": 2870 + }, + { + "epoch": 4.396527048945711, + "grad_norm": 0.1906789094209671, + "learning_rate": 6.87797309643631e-05, + "loss": 0.0065, + "step": 2880 + }, + { + "epoch": 4.411792767865662, + "grad_norm": 0.007664407137781382, + "learning_rate": 6.853251227482479e-05, + "loss": 0.0066, + "step": 2890 + }, + { + "epoch": 4.427058486785612, + "grad_norm": 0.6737648248672485, + "learning_rate": 6.828476724647542e-05, + "loss": 0.0067, + "step": 2900 + }, + { + "epoch": 4.427058486785612, + "eval_loss": 0.059585120528936386, + "eval_runtime": 159.516, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 7.303, + "step": 2900 + }, + { + "epoch": 4.442324205705562, + "grad_norm": 0.11023423075675964, + "learning_rate": 6.803650291548052e-05, + "loss": 0.0119, + "step": 2910 + }, + { + "epoch": 4.457589924625513, + "grad_norm": 0.325226753950119, + "learning_rate": 6.77877263327542e-05, + "loss": 0.0109, + "step": 2920 + }, + { + "epoch": 4.472855643545463, + "grad_norm": 0.12567752599716187, + "learning_rate": 6.753844456375899e-05, + "loss": 0.0096, + "step": 2930 + }, + { + "epoch": 4.488121362465414, + "grad_norm": 0.12976552546024323, + "learning_rate": 6.728866468830513e-05, + "loss": 0.0082, + "step": 2940 + }, + { + "epoch": 4.503387081385364, + "grad_norm": 0.22063468396663666, + "learning_rate": 6.703839380034946e-05, + "loss": 0.0082, + "step": 2950 + }, + { + "epoch": 4.503387081385364, + "eval_loss": 0.05170449614524841, + "eval_runtime": 159.47, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 2950 + }, + { + "epoch": 4.518652800305315, + "grad_norm": 0.048917531967163086, + "learning_rate": 6.6787639007794e-05, + "loss": 0.0083, + "step": 2960 + }, + { + "epoch": 4.5339185192252645, + "grad_norm": 0.18310528993606567, + "learning_rate": 6.653640743228409e-05, + "loss": 0.0058, + "step": 2970 + }, + { + "epoch": 4.549184238145215, + "grad_norm": 0.2933163046836853, + "learning_rate": 6.628470620900611e-05, + "loss": 0.0128, + "step": 2980 + }, + { + "epoch": 4.5644499570651655, + "grad_norm": 0.6412350535392761, + "learning_rate": 6.603254248648478e-05, + "loss": 0.0078, + "step": 2990 + }, + { + "epoch": 4.579715675985116, + "grad_norm": 0.62347811460495, + "learning_rate": 6.57799234263802e-05, + "loss": 0.0121, + "step": 3000 + }, + { + "epoch": 4.579715675985116, + "eval_loss": 0.052560292184352875, + "eval_runtime": 159.5486, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 3000 + }, + { + "epoch": 4.594981394905067, + "grad_norm": 0.25548842549324036, + "learning_rate": 6.552685620328447e-05, + "loss": 0.0086, + "step": 3010 + }, + { + "epoch": 4.610247113825016, + "grad_norm": 0.3364918529987335, + "learning_rate": 6.527334800451786e-05, + "loss": 0.0048, + "step": 3020 + }, + { + "epoch": 4.625512832744967, + "grad_norm": 0.05494154244661331, + "learning_rate": 6.501940602992474e-05, + "loss": 0.0133, + "step": 3030 + }, + { + "epoch": 4.640778551664917, + "grad_norm": 0.21277029812335968, + "learning_rate": 6.476503749166904e-05, + "loss": 0.0077, + "step": 3040 + }, + { + "epoch": 4.656044270584868, + "grad_norm": 0.3750063478946686, + "learning_rate": 6.45102496140295e-05, + "loss": 0.0044, + "step": 3050 + }, + { + "epoch": 4.656044270584868, + "eval_loss": 0.04872344806790352, + "eval_runtime": 159.5341, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 7.303, + "step": 3050 + }, + { + "epoch": 4.6713099895048185, + "grad_norm": 2.427780866622925, + "learning_rate": 6.42550496331944e-05, + "loss": 0.0101, + "step": 3060 + }, + { + "epoch": 4.686575708424769, + "grad_norm": 0.12002643197774887, + "learning_rate": 6.399944479705616e-05, + "loss": 0.0092, + "step": 3070 + }, + { + "epoch": 4.701841427344719, + "grad_norm": 0.3205980360507965, + "learning_rate": 6.374344236500532e-05, + "loss": 0.012, + "step": 3080 + }, + { + "epoch": 4.717107146264669, + "grad_norm": 0.20002758502960205, + "learning_rate": 6.348704960772462e-05, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 4.73237286518462, + "grad_norm": 0.058489710092544556, + "learning_rate": 6.323027380698226e-05, + "loss": 0.0107, + "step": 3100 + }, + { + "epoch": 4.73237286518462, + "eval_loss": 0.04949155077338219, + "eval_runtime": 159.4724, + "eval_samples_per_second": 7.305, + "eval_steps_per_second": 7.305, + "step": 3100 + }, + { + "epoch": 4.74763858410457, + "grad_norm": 0.21070730686187744, + "learning_rate": 6.297312225542519e-05, + "loss": 0.0097, + "step": 3110 + }, + { + "epoch": 4.762904303024521, + "grad_norm": 0.3759920597076416, + "learning_rate": 6.271560225637204e-05, + "loss": 0.0114, + "step": 3120 + }, + { + "epoch": 4.7781700219444705, + "grad_norm": 0.06371816992759705, + "learning_rate": 6.245772112360568e-05, + "loss": 0.0051, + "step": 3130 + }, + { + "epoch": 4.793435740864421, + "grad_norm": 0.5818877220153809, + "learning_rate": 6.21994861811654e-05, + "loss": 0.0136, + "step": 3140 + }, + { + "epoch": 4.808701459784372, + "grad_norm": 0.8154250383377075, + "learning_rate": 6.194090476313904e-05, + "loss": 0.0146, + "step": 3150 + }, + { + "epoch": 4.808701459784372, + "eval_loss": 0.05170228332281113, + "eval_runtime": 159.4635, + "eval_samples_per_second": 7.306, + "eval_steps_per_second": 7.306, + "step": 3150 + }, + { + "epoch": 4.823967178704322, + "grad_norm": 0.048479437828063965, + "learning_rate": 6.168198421345459e-05, + "loss": 0.0124, + "step": 3160 + }, + { + "epoch": 4.839232897624273, + "grad_norm": 0.6113202571868896, + "learning_rate": 6.142273188567173e-05, + "loss": 0.0115, + "step": 3170 + }, + { + "epoch": 4.854498616544223, + "grad_norm": 0.12574650347232819, + "learning_rate": 6.116315514277285e-05, + "loss": 0.0111, + "step": 3180 + }, + { + "epoch": 4.869764335464174, + "grad_norm": 0.3447161614894867, + "learning_rate": 6.090326135695403e-05, + "loss": 0.0126, + "step": 3190 + }, + { + "epoch": 4.885030054384123, + "grad_norm": 0.20263008773326874, + "learning_rate": 6.0643057909415634e-05, + "loss": 0.0098, + "step": 3200 + }, + { + "epoch": 4.885030054384123, + "eval_loss": 0.05373184755444527, + "eval_runtime": 159.5022, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 3200 + }, + { + "epoch": 4.900295773304074, + "grad_norm": 0.17962191998958588, + "learning_rate": 6.038255219015265e-05, + "loss": 0.0119, + "step": 3210 + }, + { + "epoch": 4.9155614922240245, + "grad_norm": 0.043420616537332535, + "learning_rate": 6.012175159774488e-05, + "loss": 0.0164, + "step": 3220 + }, + { + "epoch": 4.930827211143975, + "grad_norm": 0.3898150622844696, + "learning_rate": 5.986066353914673e-05, + "loss": 0.012, + "step": 3230 + }, + { + "epoch": 4.946092930063926, + "grad_norm": 0.4613206684589386, + "learning_rate": 5.9599295429476906e-05, + "loss": 0.0105, + "step": 3240 + }, + { + "epoch": 4.961358648983875, + "grad_norm": 0.2853253483772278, + "learning_rate": 5.933765469180779e-05, + "loss": 0.0088, + "step": 3250 + }, + { + "epoch": 4.961358648983875, + "eval_loss": 0.04844510182738304, + "eval_runtime": 159.5639, + "eval_samples_per_second": 7.301, + "eval_steps_per_second": 7.301, + "step": 3250 + }, + { + "epoch": 4.976624367903826, + "grad_norm": 0.14542199671268463, + "learning_rate": 5.907574875695463e-05, + "loss": 0.0094, + "step": 3260 + }, + { + "epoch": 4.991890086823776, + "grad_norm": 0.6809795498847961, + "learning_rate": 5.88135850632645e-05, + "loss": 0.0153, + "step": 3270 + }, + { + "epoch": 5.007155805743727, + "grad_norm": 0.15425284206867218, + "learning_rate": 5.855117105640503e-05, + "loss": 0.0171, + "step": 3280 + }, + { + "epoch": 5.022421524663677, + "grad_norm": 0.13248614966869354, + "learning_rate": 5.8288514189152955e-05, + "loss": 0.0047, + "step": 3290 + }, + { + "epoch": 5.037687243583628, + "grad_norm": 0.09773049503564835, + "learning_rate": 5.802562192118246e-05, + "loss": 0.0041, + "step": 3300 + }, + { + "epoch": 5.037687243583628, + "eval_loss": 0.05482862889766693, + "eval_runtime": 159.5685, + "eval_samples_per_second": 7.301, + "eval_steps_per_second": 7.301, + "step": 3300 + }, + { + "epoch": 5.052952962503578, + "grad_norm": 0.11446662992238998, + "learning_rate": 5.776250171885329e-05, + "loss": 0.0047, + "step": 3310 + }, + { + "epoch": 5.068218681423528, + "grad_norm": 0.1264236718416214, + "learning_rate": 5.749916105499873e-05, + "loss": 0.0056, + "step": 3320 + }, + { + "epoch": 5.083484400343479, + "grad_norm": 0.021841134876012802, + "learning_rate": 5.7235607408713343e-05, + "loss": 0.003, + "step": 3330 + }, + { + "epoch": 5.098750119263429, + "grad_norm": 0.562330424785614, + "learning_rate": 5.697184826514057e-05, + "loss": 0.0037, + "step": 3340 + }, + { + "epoch": 5.11401583818338, + "grad_norm": 0.44594606757164, + "learning_rate": 5.670789111526017e-05, + "loss": 0.0049, + "step": 3350 + }, + { + "epoch": 5.11401583818338, + "eval_loss": 0.0611434169113636, + "eval_runtime": 159.5429, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 3350 + }, + { + "epoch": 5.1292815571033294, + "grad_norm": 0.016270384192466736, + "learning_rate": 5.644374345567542e-05, + "loss": 0.0066, + "step": 3360 + }, + { + "epoch": 5.14454727602328, + "grad_norm": 0.11884745955467224, + "learning_rate": 5.6179412788400255e-05, + "loss": 0.0048, + "step": 3370 + }, + { + "epoch": 5.1598129949432305, + "grad_norm": 0.3594692647457123, + "learning_rate": 5.5914906620646145e-05, + "loss": 0.0089, + "step": 3380 + }, + { + "epoch": 5.175078713863181, + "grad_norm": 0.37823039293289185, + "learning_rate": 5.565023246460896e-05, + "loss": 0.0066, + "step": 3390 + }, + { + "epoch": 5.190344432783132, + "grad_norm": 0.05949282646179199, + "learning_rate": 5.5385397837255556e-05, + "loss": 0.0048, + "step": 3400 + }, + { + "epoch": 5.190344432783132, + "eval_loss": 0.06414221227169037, + "eval_runtime": 159.568, + "eval_samples_per_second": 7.301, + "eval_steps_per_second": 7.301, + "step": 3400 + }, + { + "epoch": 5.205610151703082, + "grad_norm": 0.7599151134490967, + "learning_rate": 5.512041026011031e-05, + "loss": 0.0054, + "step": 3410 + }, + { + "epoch": 5.220875870623032, + "grad_norm": 0.1311825066804886, + "learning_rate": 5.485527725904153e-05, + "loss": 0.0036, + "step": 3420 + }, + { + "epoch": 5.236141589542982, + "grad_norm": 0.5533745288848877, + "learning_rate": 5.4590006364047585e-05, + "loss": 0.0057, + "step": 3430 + }, + { + "epoch": 5.251407308462933, + "grad_norm": 3.0333991050720215, + "learning_rate": 5.432460510904329e-05, + "loss": 0.0081, + "step": 3440 + }, + { + "epoch": 5.2666730273828835, + "grad_norm": 0.13738901913166046, + "learning_rate": 5.405908103164571e-05, + "loss": 0.0025, + "step": 3450 + }, + { + "epoch": 5.2666730273828835, + "eval_loss": 0.06379962712526321, + "eval_runtime": 159.6179, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 3450 + }, + { + "epoch": 5.281938746302834, + "grad_norm": 0.08990368247032166, + "learning_rate": 5.3793441672960166e-05, + "loss": 0.0032, + "step": 3460 + }, + { + "epoch": 5.297204465222784, + "grad_norm": 0.014413976110517979, + "learning_rate": 5.352769457736607e-05, + "loss": 0.0049, + "step": 3470 + }, + { + "epoch": 5.312470184142734, + "grad_norm": 0.16746607422828674, + "learning_rate": 5.326184729230268e-05, + "loss": 0.0035, + "step": 3480 + }, + { + "epoch": 5.327735903062685, + "grad_norm": 0.062102314084768295, + "learning_rate": 5.29959073680547e-05, + "loss": 0.0067, + "step": 3490 + }, + { + "epoch": 5.343001621982635, + "grad_norm": 0.12228365987539291, + "learning_rate": 5.272988235753786e-05, + "loss": 0.0054, + "step": 3500 + }, + { + "epoch": 5.343001621982635, + "eval_loss": 0.062371041625738144, + "eval_runtime": 159.5518, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 3500 + }, + { + "epoch": 5.358267340902586, + "grad_norm": 0.1886071264743805, + "learning_rate": 5.24637798160844e-05, + "loss": 0.0069, + "step": 3510 + }, + { + "epoch": 5.373533059822536, + "grad_norm": 0.17080271244049072, + "learning_rate": 5.2197607301228534e-05, + "loss": 0.0046, + "step": 3520 + }, + { + "epoch": 5.388798778742486, + "grad_norm": 0.13046708703041077, + "learning_rate": 5.193137237249176e-05, + "loss": 0.0038, + "step": 3530 + }, + { + "epoch": 5.404064497662437, + "grad_norm": 0.3127999007701874, + "learning_rate": 5.166508259116818e-05, + "loss": 0.0053, + "step": 3540 + }, + { + "epoch": 5.419330216582387, + "grad_norm": 0.19052395224571228, + "learning_rate": 5.139874552010975e-05, + "loss": 0.0069, + "step": 3550 + }, + { + "epoch": 5.419330216582387, + "eval_loss": 0.06260501593351364, + "eval_runtime": 159.5436, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 3550 + }, + { + "epoch": 5.434595935502338, + "grad_norm": 0.27390000224113464, + "learning_rate": 5.113236872351154e-05, + "loss": 0.0038, + "step": 3560 + }, + { + "epoch": 5.449861654422288, + "grad_norm": 0.010349981486797333, + "learning_rate": 5.086595976669679e-05, + "loss": 0.0063, + "step": 3570 + }, + { + "epoch": 5.465127373342239, + "grad_norm": 0.3470171093940735, + "learning_rate": 5.059952621590216e-05, + "loss": 0.0058, + "step": 3580 + }, + { + "epoch": 5.480393092262188, + "grad_norm": 0.4728798568248749, + "learning_rate": 5.0333075638062776e-05, + "loss": 0.0045, + "step": 3590 + }, + { + "epoch": 5.495658811182139, + "grad_norm": 0.03188423812389374, + "learning_rate": 5.006661560059738e-05, + "loss": 0.0031, + "step": 3600 + }, + { + "epoch": 5.495658811182139, + "eval_loss": 0.06937862187623978, + "eval_runtime": 159.624, + "eval_samples_per_second": 7.298, + "eval_steps_per_second": 7.298, + "step": 3600 + }, + { + "epoch": 5.5109245301020895, + "grad_norm": 0.06070191413164139, + "learning_rate": 4.980015367119336e-05, + "loss": 0.0025, + "step": 3610 + }, + { + "epoch": 5.52619024902204, + "grad_norm": 0.09115032106637955, + "learning_rate": 4.953369741759181e-05, + "loss": 0.0082, + "step": 3620 + }, + { + "epoch": 5.541455967941991, + "grad_norm": 0.028362061828374863, + "learning_rate": 4.9267254407372645e-05, + "loss": 0.0032, + "step": 3630 + }, + { + "epoch": 5.556721686861941, + "grad_norm": 0.46696528792381287, + "learning_rate": 4.9000832207739676e-05, + "loss": 0.0034, + "step": 3640 + }, + { + "epoch": 5.571987405781891, + "grad_norm": 0.17385917901992798, + "learning_rate": 4.873443838530566e-05, + "loss": 0.0109, + "step": 3650 + }, + { + "epoch": 5.571987405781891, + "eval_loss": 0.06586816906929016, + "eval_runtime": 159.6618, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 3650 + }, + { + "epoch": 5.587253124701841, + "grad_norm": 0.24807928502559662, + "learning_rate": 4.846808050587742e-05, + "loss": 0.0034, + "step": 3660 + }, + { + "epoch": 5.602518843621792, + "grad_norm": 0.3347882628440857, + "learning_rate": 4.820176613424095e-05, + "loss": 0.0029, + "step": 3670 + }, + { + "epoch": 5.617784562541742, + "grad_norm": 0.044925060123205185, + "learning_rate": 4.793550283394661e-05, + "loss": 0.0027, + "step": 3680 + }, + { + "epoch": 5.633050281461693, + "grad_norm": 0.2767011821269989, + "learning_rate": 4.766929816709428e-05, + "loss": 0.0024, + "step": 3690 + }, + { + "epoch": 5.648316000381643, + "grad_norm": 0.40417757630348206, + "learning_rate": 4.7403159694118606e-05, + "loss": 0.0015, + "step": 3700 + }, + { + "epoch": 5.648316000381643, + "eval_loss": 0.07630186527967453, + "eval_runtime": 159.5164, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 7.303, + "step": 3700 + }, + { + "epoch": 5.663581719301593, + "grad_norm": 0.22835291922092438, + "learning_rate": 4.713709497357426e-05, + "loss": 0.003, + "step": 3710 + }, + { + "epoch": 5.678847438221544, + "grad_norm": 0.3233691155910492, + "learning_rate": 4.6871111561921314e-05, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 5.694113157141494, + "grad_norm": 0.9546028971672058, + "learning_rate": 4.660521701331057e-05, + "loss": 0.0081, + "step": 3730 + }, + { + "epoch": 5.709378876061445, + "grad_norm": 0.07153519988059998, + "learning_rate": 4.633941887936908e-05, + "loss": 0.0045, + "step": 3740 + }, + { + "epoch": 5.724644594981395, + "grad_norm": 0.10959182679653168, + "learning_rate": 4.607372470898557e-05, + "loss": 0.0034, + "step": 3750 + }, + { + "epoch": 5.724644594981395, + "eval_loss": 0.06623843312263489, + "eval_runtime": 159.5017, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 3750 + }, + { + "epoch": 5.739910313901345, + "grad_norm": 0.178690105676651, + "learning_rate": 4.580814204809618e-05, + "loss": 0.0102, + "step": 3760 + }, + { + "epoch": 5.7551760328212955, + "grad_norm": 0.09123440831899643, + "learning_rate": 4.554267843947007e-05, + "loss": 0.0029, + "step": 3770 + }, + { + "epoch": 5.770441751741246, + "grad_norm": 0.08254484087228775, + "learning_rate": 4.527734142249521e-05, + "loss": 0.0033, + "step": 3780 + }, + { + "epoch": 5.785707470661197, + "grad_norm": 0.20788267254829407, + "learning_rate": 4.501213853296425e-05, + "loss": 0.0058, + "step": 3790 + }, + { + "epoch": 5.800973189581147, + "grad_norm": 0.29946595430374146, + "learning_rate": 4.474707730286055e-05, + "loss": 0.0094, + "step": 3800 + }, + { + "epoch": 5.800973189581147, + "eval_loss": 0.06175835058093071, + "eval_runtime": 159.5065, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 3800 + }, + { + "epoch": 5.816238908501097, + "grad_norm": 0.5739238858222961, + "learning_rate": 4.448216526014419e-05, + "loss": 0.0038, + "step": 3810 + }, + { + "epoch": 5.831504627421047, + "grad_norm": 0.09805881232023239, + "learning_rate": 4.421740992853818e-05, + "loss": 0.0105, + "step": 3820 + }, + { + "epoch": 5.846770346340998, + "grad_norm": 0.05599082633852959, + "learning_rate": 4.395281882731486e-05, + "loss": 0.0053, + "step": 3830 + }, + { + "epoch": 5.8620360652609484, + "grad_norm": 0.12262944132089615, + "learning_rate": 4.368839947108226e-05, + "loss": 0.0037, + "step": 3840 + }, + { + "epoch": 5.877301784180899, + "grad_norm": 0.07163238525390625, + "learning_rate": 4.3424159369570725e-05, + "loss": 0.0027, + "step": 3850 + }, + { + "epoch": 5.877301784180899, + "eval_loss": 0.06425386667251587, + "eval_runtime": 159.541, + "eval_samples_per_second": 7.302, + "eval_steps_per_second": 7.302, + "step": 3850 + }, + { + "epoch": 5.8925675031008495, + "grad_norm": 1.107406497001648, + "learning_rate": 4.3160106027419585e-05, + "loss": 0.0046, + "step": 3860 + }, + { + "epoch": 5.907833222020799, + "grad_norm": 0.3896564245223999, + "learning_rate": 4.289624694396406e-05, + "loss": 0.0043, + "step": 3870 + }, + { + "epoch": 5.92309894094075, + "grad_norm": 0.035553038120269775, + "learning_rate": 4.263258961302232e-05, + "loss": 0.0035, + "step": 3880 + }, + { + "epoch": 5.9383646598607, + "grad_norm": 0.022125933319330215, + "learning_rate": 4.236914152268249e-05, + "loss": 0.0047, + "step": 3890 + }, + { + "epoch": 5.953630378780651, + "grad_norm": 0.2430310696363449, + "learning_rate": 4.210591015509015e-05, + "loss": 0.0038, + "step": 3900 + }, + { + "epoch": 5.953630378780651, + "eval_loss": 0.0737406462430954, + "eval_runtime": 159.5093, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 7.304, + "step": 3900 + }, + { + "epoch": 5.968896097700601, + "grad_norm": 0.10399655252695084, + "learning_rate": 4.1842902986235776e-05, + "loss": 0.0074, + "step": 3910 + }, + { + "epoch": 5.984161816620551, + "grad_norm": 0.2254919856786728, + "learning_rate": 4.1580127485742406e-05, + "loss": 0.0059, + "step": 3920 + }, + { + "epoch": 5.999427535540502, + "grad_norm": 0.18833018839359283, + "learning_rate": 4.131759111665349e-05, + "loss": 0.0131, + "step": 3930 + }, + { + "epoch": 6.014693254460452, + "grad_norm": 0.07881102710962296, + "learning_rate": 4.1055301335220955e-05, + "loss": 0.004, + "step": 3940 + }, + { + "epoch": 6.029958973380403, + "grad_norm": 0.050173331052064896, + "learning_rate": 4.079326559069343e-05, + "loss": 0.001, + "step": 3950 + }, + { + "epoch": 6.029958973380403, + "eval_loss": 0.06052103266119957, + "eval_runtime": 159.6128, + "eval_samples_per_second": 7.299, + "eval_steps_per_second": 7.299, + "step": 3950 + }, + { + "epoch": 6.045224692300353, + "grad_norm": 0.040920376777648926, + "learning_rate": 4.0531491325104715e-05, + "loss": 0.0032, + "step": 3960 + }, + { + "epoch": 6.060490411220304, + "grad_norm": 0.016376454383134842, + "learning_rate": 4.026998597306233e-05, + "loss": 0.0012, + "step": 3970 + }, + { + "epoch": 6.075756130140253, + "grad_norm": 0.032851431518793106, + "learning_rate": 4.000875696153644e-05, + "loss": 0.001, + "step": 3980 + }, + { + "epoch": 6.091021849060204, + "grad_norm": 0.19362133741378784, + "learning_rate": 3.9747811709648976e-05, + "loss": 0.005, + "step": 3990 + }, + { + "epoch": 6.1062875679801545, + "grad_norm": 0.017209986224770546, + "learning_rate": 3.948715762846278e-05, + "loss": 0.0022, + "step": 4000 + }, + { + "epoch": 6.1062875679801545, + "eval_loss": 0.07027663290500641, + "eval_runtime": 159.6335, + "eval_samples_per_second": 7.298, + "eval_steps_per_second": 7.298, + "step": 4000 + }, + { + "epoch": 6.121553286900105, + "grad_norm": 0.04576193168759346, + "learning_rate": 3.922680212077125e-05, + "loss": 0.0036, + "step": 4010 + }, + { + "epoch": 6.136819005820056, + "grad_norm": 0.2979854643344879, + "learning_rate": 3.896675258088801e-05, + "loss": 0.0005, + "step": 4020 + }, + { + "epoch": 6.152084724740006, + "grad_norm": 0.02651374787092209, + "learning_rate": 3.870701639443698e-05, + "loss": 0.0034, + "step": 4030 + }, + { + "epoch": 6.167350443659956, + "grad_norm": 0.15189190208911896, + "learning_rate": 3.844760093814255e-05, + "loss": 0.003, + "step": 4040 + }, + { + "epoch": 6.182616162579906, + "grad_norm": 0.687329888343811, + "learning_rate": 3.818851357962013e-05, + "loss": 0.0015, + "step": 4050 + }, + { + "epoch": 6.182616162579906, + "eval_loss": 0.07441620528697968, + "eval_runtime": 159.6434, + "eval_samples_per_second": 7.298, + "eval_steps_per_second": 7.298, + "step": 4050 + }, + { + "epoch": 6.197881881499857, + "grad_norm": 0.3071117401123047, + "learning_rate": 3.792976167716685e-05, + "loss": 0.002, + "step": 4060 + }, + { + "epoch": 6.213147600419807, + "grad_norm": 0.15328466892242432, + "learning_rate": 3.767135257955265e-05, + "loss": 0.0014, + "step": 4070 + }, + { + "epoch": 6.228413319339758, + "grad_norm": 0.10137467086315155, + "learning_rate": 3.74132936258115e-05, + "loss": 0.0054, + "step": 4080 + }, + { + "epoch": 6.2436790382597085, + "grad_norm": 0.3588423728942871, + "learning_rate": 3.715559214503298e-05, + "loss": 0.0024, + "step": 4090 + }, + { + "epoch": 6.258944757179658, + "grad_norm": 0.02178932912647724, + "learning_rate": 3.6898255456154194e-05, + "loss": 0.003, + "step": 4100 + }, + { + "epoch": 6.258944757179658, + "eval_loss": 0.07465291023254395, + "eval_runtime": 159.6737, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4100 + }, + { + "epoch": 6.274210476099609, + "grad_norm": 0.006120866630226374, + "learning_rate": 3.664129086775176e-05, + "loss": 0.002, + "step": 4110 + }, + { + "epoch": 6.289476195019559, + "grad_norm": 0.05380964279174805, + "learning_rate": 3.638470567783442e-05, + "loss": 0.0041, + "step": 4120 + }, + { + "epoch": 6.30474191393951, + "grad_norm": 0.6024661064147949, + "learning_rate": 3.612850717363563e-05, + "loss": 0.0023, + "step": 4130 + }, + { + "epoch": 6.32000763285946, + "grad_norm": 0.6831842660903931, + "learning_rate": 3.5872702631406675e-05, + "loss": 0.0029, + "step": 4140 + }, + { + "epoch": 6.33527335177941, + "grad_norm": 0.01892218552529812, + "learning_rate": 3.561729931620998e-05, + "loss": 0.0022, + "step": 4150 + }, + { + "epoch": 6.33527335177941, + "eval_loss": 0.06904370337724686, + "eval_runtime": 159.6719, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4150 + }, + { + "epoch": 6.3505390706993605, + "grad_norm": 0.016175104305148125, + "learning_rate": 3.5362304481712803e-05, + "loss": 0.0011, + "step": 4160 + }, + { + "epoch": 6.365804789619311, + "grad_norm": 0.02106320485472679, + "learning_rate": 3.5107725369981184e-05, + "loss": 0.0069, + "step": 4170 + }, + { + "epoch": 6.381070508539262, + "grad_norm": 0.07916820049285889, + "learning_rate": 3.4853569211274306e-05, + "loss": 0.0018, + "step": 4180 + }, + { + "epoch": 6.396336227459212, + "grad_norm": 0.18828001618385315, + "learning_rate": 3.459984322383913e-05, + "loss": 0.0073, + "step": 4190 + }, + { + "epoch": 6.411601946379163, + "grad_norm": 0.6668338775634766, + "learning_rate": 3.434655461370538e-05, + "loss": 0.0055, + "step": 4200 + }, + { + "epoch": 6.411601946379163, + "eval_loss": 0.06506691128015518, + "eval_runtime": 159.5993, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 7.3, + "step": 4200 + }, + { + "epoch": 6.426867665299112, + "grad_norm": 0.11451531946659088, + "learning_rate": 3.409371057448092e-05, + "loss": 0.0008, + "step": 4210 + }, + { + "epoch": 6.442133384219063, + "grad_norm": 0.042590703815221786, + "learning_rate": 3.384131828714743e-05, + "loss": 0.0027, + "step": 4220 + }, + { + "epoch": 6.457399103139013, + "grad_norm": 0.010321835987269878, + "learning_rate": 3.3589384919856446e-05, + "loss": 0.0017, + "step": 4230 + }, + { + "epoch": 6.472664822058964, + "grad_norm": 0.0195904690772295, + "learning_rate": 3.33379176277258e-05, + "loss": 0.002, + "step": 4240 + }, + { + "epoch": 6.4879305409789145, + "grad_norm": 0.03529507294297218, + "learning_rate": 3.3086923552636414e-05, + "loss": 0.0039, + "step": 4250 + }, + { + "epoch": 6.4879305409789145, + "eval_loss": 0.06506752222776413, + "eval_runtime": 159.6638, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 4250 + }, + { + "epoch": 6.503196259898864, + "grad_norm": 0.014751738868653774, + "learning_rate": 3.28364098230294e-05, + "loss": 0.0025, + "step": 4260 + }, + { + "epoch": 6.518461978818815, + "grad_norm": 0.21436801552772522, + "learning_rate": 3.258638355370372e-05, + "loss": 0.0053, + "step": 4270 + }, + { + "epoch": 6.533727697738765, + "grad_norm": 0.20733974874019623, + "learning_rate": 3.2336851845614016e-05, + "loss": 0.002, + "step": 4280 + }, + { + "epoch": 6.548993416658716, + "grad_norm": 0.19165655970573425, + "learning_rate": 3.2087821785669e-05, + "loss": 0.0051, + "step": 4290 + }, + { + "epoch": 6.564259135578666, + "grad_norm": 0.6599664092063904, + "learning_rate": 3.183930044653014e-05, + "loss": 0.0038, + "step": 4300 + }, + { + "epoch": 6.564259135578666, + "eval_loss": 0.06381241232156754, + "eval_runtime": 159.6611, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 4300 + }, + { + "epoch": 6.579524854498617, + "grad_norm": 0.2495010942220688, + "learning_rate": 3.159129488641084e-05, + "loss": 0.0022, + "step": 4310 + }, + { + "epoch": 6.594790573418567, + "grad_norm": 0.04184914380311966, + "learning_rate": 3.1343812148875936e-05, + "loss": 0.0014, + "step": 4320 + }, + { + "epoch": 6.610056292338517, + "grad_norm": 0.01129506528377533, + "learning_rate": 3.109685926264161e-05, + "loss": 0.0013, + "step": 4330 + }, + { + "epoch": 6.625322011258468, + "grad_norm": 0.16454286873340607, + "learning_rate": 3.085044324137591e-05, + "loss": 0.0022, + "step": 4340 + }, + { + "epoch": 6.640587730178418, + "grad_norm": 0.19312164187431335, + "learning_rate": 3.060457108349943e-05, + "loss": 0.0009, + "step": 4350 + }, + { + "epoch": 6.640587730178418, + "eval_loss": 0.06959080696105957, + "eval_runtime": 159.6654, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 4350 + }, + { + "epoch": 6.655853449098369, + "grad_norm": 0.012085399590432644, + "learning_rate": 3.0359249771986604e-05, + "loss": 0.0022, + "step": 4360 + }, + { + "epoch": 6.671119168018318, + "grad_norm": 0.03179095312952995, + "learning_rate": 3.0114486274167343e-05, + "loss": 0.0008, + "step": 4370 + }, + { + "epoch": 6.686384886938269, + "grad_norm": 0.011703737080097198, + "learning_rate": 2.9870287541529202e-05, + "loss": 0.0032, + "step": 4380 + }, + { + "epoch": 6.7016506058582195, + "grad_norm": 0.008592026308178902, + "learning_rate": 2.962666050951997e-05, + "loss": 0.0031, + "step": 4390 + }, + { + "epoch": 6.71691632477817, + "grad_norm": 0.08471984416246414, + "learning_rate": 2.9383612097350578e-05, + "loss": 0.0032, + "step": 4400 + }, + { + "epoch": 6.71691632477817, + "eval_loss": 0.07111377269029617, + "eval_runtime": 159.6389, + "eval_samples_per_second": 7.298, + "eval_steps_per_second": 7.298, + "step": 4400 + }, + { + "epoch": 6.732182043698121, + "grad_norm": 0.3381322920322418, + "learning_rate": 2.914114920779872e-05, + "loss": 0.0021, + "step": 4410 + }, + { + "epoch": 6.747447762618071, + "grad_norm": 0.08128316700458527, + "learning_rate": 2.889927872701278e-05, + "loss": 0.0024, + "step": 4420 + }, + { + "epoch": 6.762713481538022, + "grad_norm": 0.020396780222654343, + "learning_rate": 2.865800752431621e-05, + "loss": 0.0012, + "step": 4430 + }, + { + "epoch": 6.777979200457971, + "grad_norm": 0.30782634019851685, + "learning_rate": 2.8417342452012462e-05, + "loss": 0.0046, + "step": 4440 + }, + { + "epoch": 6.793244919377922, + "grad_norm": 0.025567952543497086, + "learning_rate": 2.8177290345190386e-05, + "loss": 0.001, + "step": 4450 + }, + { + "epoch": 6.793244919377922, + "eval_loss": 0.0718027651309967, + "eval_runtime": 159.6867, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4450 + }, + { + "epoch": 6.808510638297872, + "grad_norm": 0.04894142970442772, + "learning_rate": 2.7937858021530104e-05, + "loss": 0.0023, + "step": 4460 + }, + { + "epoch": 6.823776357217823, + "grad_norm": 0.07794681191444397, + "learning_rate": 2.7699052281109384e-05, + "loss": 0.0034, + "step": 4470 + }, + { + "epoch": 6.8390420761377735, + "grad_norm": 0.11947426944971085, + "learning_rate": 2.7460879906210487e-05, + "loss": 0.0011, + "step": 4480 + }, + { + "epoch": 6.854307795057723, + "grad_norm": 0.04827739670872688, + "learning_rate": 2.7223347661127578e-05, + "loss": 0.002, + "step": 4490 + }, + { + "epoch": 6.869573513977674, + "grad_norm": 0.011265079490840435, + "learning_rate": 2.698646229197461e-05, + "loss": 0.0024, + "step": 4500 + }, + { + "epoch": 6.869573513977674, + "eval_loss": 0.07399994134902954, + "eval_runtime": 159.6474, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 4500 + }, + { + "epoch": 6.884839232897624, + "grad_norm": 0.25395667552948, + "learning_rate": 2.6750230526493703e-05, + "loss": 0.0044, + "step": 4510 + }, + { + "epoch": 6.900104951817575, + "grad_norm": 0.3501344919204712, + "learning_rate": 2.6514659073864102e-05, + "loss": 0.0028, + "step": 4520 + }, + { + "epoch": 6.915370670737525, + "grad_norm": 0.04269612953066826, + "learning_rate": 2.6279754624511598e-05, + "loss": 0.0008, + "step": 4530 + }, + { + "epoch": 6.930636389657476, + "grad_norm": 0.4863770604133606, + "learning_rate": 2.604552384991855e-05, + "loss": 0.0029, + "step": 4540 + }, + { + "epoch": 6.9459021085774255, + "grad_norm": 0.34951895475387573, + "learning_rate": 2.5811973402434385e-05, + "loss": 0.0014, + "step": 4550 + }, + { + "epoch": 6.9459021085774255, + "eval_loss": 0.07328890264034271, + "eval_runtime": 159.6704, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4550 + }, + { + "epoch": 6.961167827497376, + "grad_norm": 0.3526895046234131, + "learning_rate": 2.557910991508667e-05, + "loss": 0.005, + "step": 4560 + }, + { + "epoch": 6.976433546417327, + "grad_norm": 0.10379170626401901, + "learning_rate": 2.5346940001392728e-05, + "loss": 0.0019, + "step": 4570 + }, + { + "epoch": 6.991699265337277, + "grad_norm": 0.11917455494403839, + "learning_rate": 2.5115470255171835e-05, + "loss": 0.0022, + "step": 4580 + }, + { + "epoch": 7.006964984257228, + "grad_norm": 0.009902575984597206, + "learning_rate": 2.4884707250357886e-05, + "loss": 0.0033, + "step": 4590 + }, + { + "epoch": 7.022230703177177, + "grad_norm": 0.014623602852225304, + "learning_rate": 2.465465754081277e-05, + "loss": 0.0017, + "step": 4600 + }, + { + "epoch": 7.022230703177177, + "eval_loss": 0.06816408783197403, + "eval_runtime": 159.672, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4600 + }, + { + "epoch": 7.037496422097128, + "grad_norm": 0.04369521886110306, + "learning_rate": 2.442532766014019e-05, + "loss": 0.0006, + "step": 4610 + }, + { + "epoch": 7.052762141017078, + "grad_norm": 0.012834345921874046, + "learning_rate": 2.4196724121500043e-05, + "loss": 0.0003, + "step": 4620 + }, + { + "epoch": 7.068027859937029, + "grad_norm": 0.004107500892132521, + "learning_rate": 2.3968853417423608e-05, + "loss": 0.0004, + "step": 4630 + }, + { + "epoch": 7.0832935788569795, + "grad_norm": 0.06816854327917099, + "learning_rate": 2.3741722019629e-05, + "loss": 0.0007, + "step": 4640 + }, + { + "epoch": 7.09855929777693, + "grad_norm": 0.009550396353006363, + "learning_rate": 2.3515336378837403e-05, + "loss": 0.0009, + "step": 4650 + }, + { + "epoch": 7.09855929777693, + "eval_loss": 0.07572363317012787, + "eval_runtime": 159.66, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 7.297, + "step": 4650 + }, + { + "epoch": 7.11382501669688, + "grad_norm": 0.0069396113976836205, + "learning_rate": 2.3289702924589914e-05, + "loss": 0.0005, + "step": 4660 + }, + { + "epoch": 7.12909073561683, + "grad_norm": 0.2257951945066452, + "learning_rate": 2.3064828065064876e-05, + "loss": 0.0027, + "step": 4670 + }, + { + "epoch": 7.144356454536781, + "grad_norm": 0.06146746873855591, + "learning_rate": 2.284071818689593e-05, + "loss": 0.0013, + "step": 4680 + }, + { + "epoch": 7.159622173456731, + "grad_norm": 0.02538224682211876, + "learning_rate": 2.2617379654990623e-05, + "loss": 0.0007, + "step": 4690 + }, + { + "epoch": 7.174887892376682, + "grad_norm": 0.04200801998376846, + "learning_rate": 2.2394818812349534e-05, + "loss": 0.0016, + "step": 4700 + }, + { + "epoch": 7.174887892376682, + "eval_loss": 0.07773684710264206, + "eval_runtime": 159.639, + "eval_samples_per_second": 7.298, + "eval_steps_per_second": 7.298, + "step": 4700 + }, + { + "epoch": 7.1901536112966316, + "grad_norm": 0.011373401619493961, + "learning_rate": 2.217304197988635e-05, + "loss": 0.0006, + "step": 4710 + }, + { + "epoch": 7.205419330216582, + "grad_norm": 1.122536540031433, + "learning_rate": 2.1952055456248127e-05, + "loss": 0.0031, + "step": 4720 + }, + { + "epoch": 7.220685049136533, + "grad_norm": 0.013550849631428719, + "learning_rate": 2.1731865517636517e-05, + "loss": 0.0004, + "step": 4730 + }, + { + "epoch": 7.235950768056483, + "grad_norm": 0.009380185976624489, + "learning_rate": 2.1512478417629494e-05, + "loss": 0.0007, + "step": 4740 + }, + { + "epoch": 7.251216486976434, + "grad_norm": 0.0018232482252642512, + "learning_rate": 2.129390038700374e-05, + "loss": 0.0004, + "step": 4750 + }, + { + "epoch": 7.251216486976434, + "eval_loss": 0.07988452911376953, + "eval_runtime": 159.7358, + "eval_samples_per_second": 7.293, + "eval_steps_per_second": 7.293, + "step": 4750 + }, + { + "epoch": 7.266482205896384, + "grad_norm": 0.008628422394394875, + "learning_rate": 2.1076137633557708e-05, + "loss": 0.0003, + "step": 4760 + }, + { + "epoch": 7.281747924816334, + "grad_norm": 0.13268333673477173, + "learning_rate": 2.085919634193525e-05, + "loss": 0.0003, + "step": 4770 + }, + { + "epoch": 7.2970136437362845, + "grad_norm": 0.0029896984342485666, + "learning_rate": 2.0643082673450055e-05, + "loss": 0.0007, + "step": 4780 + }, + { + "epoch": 7.312279362656235, + "grad_norm": 0.05689848214387894, + "learning_rate": 2.0427802765910647e-05, + "loss": 0.0003, + "step": 4790 + }, + { + "epoch": 7.327545081576186, + "grad_norm": 0.3269273638725281, + "learning_rate": 2.0213362733446013e-05, + "loss": 0.0019, + "step": 4800 + }, + { + "epoch": 7.327545081576186, + "eval_loss": 0.08388219773769379, + "eval_runtime": 159.6702, + "eval_samples_per_second": 7.296, + "eval_steps_per_second": 7.296, + "step": 4800 + }, + { + "epoch": 7.342810800496136, + "grad_norm": 0.002009670017287135, + "learning_rate": 1.9999768666331973e-05, + "loss": 0.0005, + "step": 4810 + }, + { + "epoch": 7.358076519416087, + "grad_norm": 0.009270197711884975, + "learning_rate": 1.9787026630818267e-05, + "loss": 0.0001, + "step": 4820 + }, + { + "epoch": 7.373342238336036, + "grad_norm": 0.042061783373355865, + "learning_rate": 1.9575142668956203e-05, + "loss": 0.0007, + "step": 4830 + }, + { + "epoch": 7.388607957255987, + "grad_norm": 0.07137414813041687, + "learning_rate": 1.936412279842705e-05, + "loss": 0.0006, + "step": 4840 + }, + { + "epoch": 7.403873676175937, + "grad_norm": 0.016255928203463554, + "learning_rate": 1.9153973012371214e-05, + "loss": 0.0003, + "step": 4850 + }, + { + "epoch": 7.403873676175937, + "eval_loss": 0.08946722000837326, + "eval_runtime": 159.7468, + "eval_samples_per_second": 7.293, + "eval_steps_per_second": 7.293, + "step": 4850 + }, + { + "epoch": 7.419139395095888, + "grad_norm": 0.022102078422904015, + "learning_rate": 1.8944699279217935e-05, + "loss": 0.0004, + "step": 4860 + }, + { + "epoch": 7.4344051140158385, + "grad_norm": 0.004178919829428196, + "learning_rate": 1.873630754251588e-05, + "loss": 0.0002, + "step": 4870 + }, + { + "epoch": 7.449670832935789, + "grad_norm": 0.005114271305501461, + "learning_rate": 1.852880372076422e-05, + "loss": 0.0003, + "step": 4880 + }, + { + "epoch": 7.464936551855739, + "grad_norm": 0.00423672329634428, + "learning_rate": 1.8322193707244623e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 7.480202270775689, + "grad_norm": 0.001411727978847921, + "learning_rate": 1.8116483369853855e-05, + "loss": 0.0001, + "step": 4900 + }, + { + "epoch": 7.480202270775689, + "eval_loss": 0.09254661947488785, + "eval_runtime": 159.8031, + "eval_samples_per_second": 7.29, + "eval_steps_per_second": 7.29, + "step": 4900 + }, + { + "epoch": 7.49546798969564, + "grad_norm": 0.11618539690971375, + "learning_rate": 1.791167855093715e-05, + "loss": 0.0048, + "step": 4910 + }, + { + "epoch": 7.51073370861559, + "grad_norm": 0.009215681813657284, + "learning_rate": 1.7707785067122207e-05, + "loss": 0.0004, + "step": 4920 + }, + { + "epoch": 7.525999427535541, + "grad_norm": 0.006266661919653416, + "learning_rate": 1.7504808709154104e-05, + "loss": 0.0011, + "step": 4930 + }, + { + "epoch": 7.5412651464554905, + "grad_norm": 0.030067048966884613, + "learning_rate": 1.7302755241730735e-05, + "loss": 0.0001, + "step": 4940 + }, + { + "epoch": 7.556530865375441, + "grad_norm": 0.005040339194238186, + "learning_rate": 1.710163040333919e-05, + "loss": 0.0004, + "step": 4950 + }, + { + "epoch": 7.556530865375441, + "eval_loss": 0.08718981593847275, + "eval_runtime": 159.802, + "eval_samples_per_second": 7.29, + "eval_steps_per_second": 7.29, + "step": 4950 + }, + { + "epoch": 7.571796584295392, + "grad_norm": 0.16111654043197632, + "learning_rate": 1.6901439906092683e-05, + "loss": 0.0022, + "step": 4960 + }, + { + "epoch": 7.587062303215342, + "grad_norm": 0.010601126588881016, + "learning_rate": 1.6702189435568344e-05, + "loss": 0.0001, + "step": 4970 + }, + { + "epoch": 7.602328022135293, + "grad_norm": 0.018727615475654602, + "learning_rate": 1.6503884650645806e-05, + "loss": 0.0005, + "step": 4980 + }, + { + "epoch": 7.617593741055243, + "grad_norm": 0.029771562665700912, + "learning_rate": 1.6306531183346385e-05, + "loss": 0.0002, + "step": 4990 + }, + { + "epoch": 7.632859459975193, + "grad_norm": 0.006750907748937607, + "learning_rate": 1.611013463867322e-05, + "loss": 0.0005, + "step": 5000 + }, + { + "epoch": 7.632859459975193, + "eval_loss": 0.0858391746878624, + "eval_runtime": 159.7806, + "eval_samples_per_second": 7.291, + "eval_steps_per_second": 7.291, + "step": 5000 + }, + { + "epoch": 7.648125178895143, + "grad_norm": 0.02183113619685173, + "learning_rate": 1.5914700594452048e-05, + "loss": 0.0014, + "step": 5010 + }, + { + "epoch": 7.663390897815094, + "grad_norm": 0.01728837378323078, + "learning_rate": 1.5720234601172766e-05, + "loss": 0.0002, + "step": 5020 + }, + { + "epoch": 7.6786566167350445, + "grad_norm": 0.009235146455466747, + "learning_rate": 1.5526742181831865e-05, + "loss": 0.0001, + "step": 5030 + }, + { + "epoch": 7.693922335654995, + "grad_norm": 0.4782828688621521, + "learning_rate": 1.5334228831775478e-05, + "loss": 0.002, + "step": 5040 + }, + { + "epoch": 7.709188054574945, + "grad_norm": 0.004086898639798164, + "learning_rate": 1.5142700018543382e-05, + "loss": 0.0007, + "step": 5050 + }, + { + "epoch": 7.709188054574945, + "eval_loss": 0.08867139369249344, + "eval_runtime": 159.7519, + "eval_samples_per_second": 7.293, + "eval_steps_per_second": 7.293, + "step": 5050 + }, + { + "epoch": 7.724453773494895, + "grad_norm": 0.1089015081524849, + "learning_rate": 1.4952161181713647e-05, + "loss": 0.0017, + "step": 5060 + }, + { + "epoch": 7.739719492414846, + "grad_norm": 0.036126453429460526, + "learning_rate": 1.4762617732748224e-05, + "loss": 0.0025, + "step": 5070 + }, + { + "epoch": 7.754985211334796, + "grad_norm": 0.05139140039682388, + "learning_rate": 1.4574075054839208e-05, + "loss": 0.0014, + "step": 5080 + }, + { + "epoch": 7.770250930254747, + "grad_norm": 0.888748824596405, + "learning_rate": 1.438653850275597e-05, + "loss": 0.0019, + "step": 5090 + }, + { + "epoch": 7.785516649174697, + "grad_norm": 0.039054010063409805, + "learning_rate": 1.420001340269303e-05, + "loss": 0.0002, + "step": 5100 + }, + { + "epoch": 7.785516649174697, + "eval_loss": 0.0848807543516159, + "eval_runtime": 159.7803, + "eval_samples_per_second": 7.291, + "eval_steps_per_second": 7.291, + "step": 5100 + }, + { + "epoch": 7.800782368094647, + "grad_norm": 0.0031700143590569496, + "learning_rate": 1.4014505052118892e-05, + "loss": 0.0002, + "step": 5110 + }, + { + "epoch": 7.816048087014598, + "grad_norm": 0.18561773002147675, + "learning_rate": 1.383001871962547e-05, + "loss": 0.0024, + "step": 5120 + }, + { + "epoch": 7.831313805934548, + "grad_norm": 0.31953901052474976, + "learning_rate": 1.3646559644778556e-05, + "loss": 0.0027, + "step": 5130 + }, + { + "epoch": 7.846579524854499, + "grad_norm": 0.00383644993416965, + "learning_rate": 1.3464133037968912e-05, + "loss": 0.0008, + "step": 5140 + }, + { + "epoch": 7.861845243774449, + "grad_norm": 0.21882972121238708, + "learning_rate": 1.328274408026438e-05, + "loss": 0.0007, + "step": 5150 + }, + { + "epoch": 7.861845243774449, + "eval_loss": 0.08296097815036774, + "eval_runtime": 159.8031, + "eval_samples_per_second": 7.29, + "eval_steps_per_second": 7.29, + "step": 5150 + }, + { + "epoch": 7.877110962694399, + "grad_norm": 0.004578197840601206, + "learning_rate": 1.3102397923262711e-05, + "loss": 0.0004, + "step": 5160 + }, + { + "epoch": 7.8923766816143495, + "grad_norm": 0.00605930807068944, + "learning_rate": 1.2923099688945234e-05, + "loss": 0.0005, + "step": 5170 + }, + { + "epoch": 7.9076424005343, + "grad_norm": 0.00446905056014657, + "learning_rate": 1.2744854469531376e-05, + "loss": 0.0014, + "step": 5180 + }, + { + "epoch": 7.9229081194542506, + "grad_norm": 0.016885535791516304, + "learning_rate": 1.2567667327334125e-05, + "loss": 0.0007, + "step": 5190 + }, + { + "epoch": 7.938173838374201, + "grad_norm": 0.011587091721594334, + "learning_rate": 1.2391543294616147e-05, + "loss": 0.0016, + "step": 5200 + }, + { + "epoch": 7.938173838374201, + "eval_loss": 0.08485978096723557, + "eval_runtime": 159.8423, + "eval_samples_per_second": 7.288, + "eval_steps_per_second": 7.288, + "step": 5200 + }, + { + "epoch": 7.953439557294152, + "grad_norm": 0.005680097732692957, + "learning_rate": 1.2216487373446894e-05, + "loss": 0.0001, + "step": 5210 + }, + { + "epoch": 7.968705276214102, + "grad_norm": 0.013168740086257458, + "learning_rate": 1.20425045355606e-05, + "loss": 0.0002, + "step": 5220 + }, + { + "epoch": 7.983970995134052, + "grad_norm": 0.008901200257241726, + "learning_rate": 1.1869599722215014e-05, + "loss": 0.0013, + "step": 5230 + }, + { + "epoch": 7.999236714054002, + "grad_norm": 0.0027007104363292456, + "learning_rate": 1.1697777844051105e-05, + "loss": 0.0006, + "step": 5240 + }, + { + "epoch": 8.014502432973952, + "grad_norm": 0.010568113997578621, + "learning_rate": 1.1527043780953567e-05, + "loss": 0.0003, + "step": 5250 + }, + { + "epoch": 8.014502432973952, + "eval_loss": 0.08593782782554626, + "eval_runtime": 160.0116, + "eval_samples_per_second": 7.281, + "eval_steps_per_second": 7.281, + "step": 5250 + }, + { + "epoch": 8.029768151893903, + "grad_norm": 0.009893224574625492, + "learning_rate": 1.1357402381912224e-05, + "loss": 0.0, + "step": 5260 + }, + { + "epoch": 8.045033870813853, + "grad_norm": 0.000797534710727632, + "learning_rate": 1.11888584648844e-05, + "loss": 0.0002, + "step": 5270 + }, + { + "epoch": 8.060299589733804, + "grad_norm": 0.008847147226333618, + "learning_rate": 1.1021416816657915e-05, + "loss": 0.0003, + "step": 5280 + }, + { + "epoch": 8.075565308653754, + "grad_norm": 0.003816022304818034, + "learning_rate": 1.0855082192715294e-05, + "loss": 0.0001, + "step": 5290 + }, + { + "epoch": 8.090831027573705, + "grad_norm": 0.016444409266114235, + "learning_rate": 1.0689859317098639e-05, + "loss": 0.0001, + "step": 5300 + }, + { + "epoch": 8.090831027573705, + "eval_loss": 0.08883290737867355, + "eval_runtime": 159.9823, + "eval_samples_per_second": 7.282, + "eval_steps_per_second": 7.282, + "step": 5300 + }, + { + "epoch": 8.106096746493655, + "grad_norm": 0.0040445211343467236, + "learning_rate": 1.052575288227547e-05, + "loss": 0.001, + "step": 5310 + }, + { + "epoch": 8.121362465413606, + "grad_norm": 0.0623970627784729, + "learning_rate": 1.0362767549005453e-05, + "loss": 0.0007, + "step": 5320 + }, + { + "epoch": 8.136628184333556, + "grad_norm": 0.010912500321865082, + "learning_rate": 1.0200907946208049e-05, + "loss": 0.0006, + "step": 5330 + }, + { + "epoch": 8.151893903253507, + "grad_norm": 0.0010389735689386725, + "learning_rate": 1.0040178670830996e-05, + "loss": 0.0001, + "step": 5340 + }, + { + "epoch": 8.167159622173457, + "grad_norm": 0.007230148650705814, + "learning_rate": 9.880584287719868e-06, + "loss": 0.0007, + "step": 5350 + }, + { + "epoch": 8.167159622173457, + "eval_loss": 0.08856642246246338, + "eval_runtime": 160.1118, + "eval_samples_per_second": 7.276, + "eval_steps_per_second": 7.276, + "step": 5350 + }, + { + "epoch": 8.182425341093406, + "grad_norm": 0.007614819798618555, + "learning_rate": 9.722129329488255e-06, + "loss": 0.0005, + "step": 5360 + }, + { + "epoch": 8.197691060013357, + "grad_norm": 0.038253843784332275, + "learning_rate": 9.5648182963892e-06, + "loss": 0.0002, + "step": 5370 + }, + { + "epoch": 8.212956778933307, + "grad_norm": 0.019887778908014297, + "learning_rate": 9.408655656187282e-06, + "loss": 0.0002, + "step": 5380 + }, + { + "epoch": 8.228222497853258, + "grad_norm": 0.0055953869596123695, + "learning_rate": 9.253645844031783e-06, + "loss": 0.0001, + "step": 5390 + }, + { + "epoch": 8.243488216773208, + "grad_norm": 0.04061136767268181, + "learning_rate": 9.099793262330698e-06, + "loss": 0.0003, + "step": 5400 + }, + { + "epoch": 8.243488216773208, + "eval_loss": 0.0913085788488388, + "eval_runtime": 160.1438, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 7.275, + "step": 5400 + }, + { + "epoch": 8.258753935693159, + "grad_norm": 0.0008719676407054067, + "learning_rate": 8.947102280625708e-06, + "loss": 0.0007, + "step": 5410 + }, + { + "epoch": 8.27401965461311, + "grad_norm": 0.0020250342786312103, + "learning_rate": 8.7955772354681e-06, + "loss": 0.0001, + "step": 5420 + }, + { + "epoch": 8.28928537353306, + "grad_norm": 0.0006268220022320747, + "learning_rate": 8.645222430295568e-06, + "loss": 0.0001, + "step": 5430 + }, + { + "epoch": 8.30455109245301, + "grad_norm": 0.00205238233320415, + "learning_rate": 8.49604213531004e-06, + "loss": 0.0001, + "step": 5440 + }, + { + "epoch": 8.319816811372961, + "grad_norm": 0.00863594003021717, + "learning_rate": 8.348040587356349e-06, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 8.319816811372961, + "eval_loss": 0.09238555282354355, + "eval_runtime": 160.06, + "eval_samples_per_second": 7.279, + "eval_steps_per_second": 7.279, + "step": 5450 + }, + { + "epoch": 8.335082530292912, + "grad_norm": 0.11376556754112244, + "learning_rate": 8.20122198980195e-06, + "loss": 0.0004, + "step": 5460 + }, + { + "epoch": 8.350348249212862, + "grad_norm": 0.0025415513664484024, + "learning_rate": 8.0555905124175e-06, + "loss": 0.0003, + "step": 5470 + }, + { + "epoch": 8.365613968132811, + "grad_norm": 0.0032609249465167522, + "learning_rate": 7.911150291258474e-06, + "loss": 0.0001, + "step": 5480 + }, + { + "epoch": 8.380879687052762, + "grad_norm": 0.005834829993546009, + "learning_rate": 7.767905428547655e-06, + "loss": 0.0001, + "step": 5490 + }, + { + "epoch": 8.396145405972712, + "grad_norm": 0.3525996804237366, + "learning_rate": 7.625859992558665e-06, + "loss": 0.0008, + "step": 5500 + }, + { + "epoch": 8.396145405972712, + "eval_loss": 0.09373430907726288, + "eval_runtime": 160.207, + "eval_samples_per_second": 7.272, + "eval_steps_per_second": 7.272, + "step": 5500 + }, + { + "epoch": 8.411411124892663, + "grad_norm": 0.0033581305760890245, + "learning_rate": 7.485018017500406e-06, + "loss": 0.0, + "step": 5510 + }, + { + "epoch": 8.426676843812613, + "grad_norm": 0.00323873502202332, + "learning_rate": 7.34538350340247e-06, + "loss": 0.0001, + "step": 5520 + }, + { + "epoch": 8.441942562732564, + "grad_norm": 0.0015864345477893949, + "learning_rate": 7.206960416001562e-06, + "loss": 0.0006, + "step": 5530 + }, + { + "epoch": 8.457208281652514, + "grad_norm": 0.057617250829935074, + "learning_rate": 7.069752686628861e-06, + "loss": 0.0001, + "step": 5540 + }, + { + "epoch": 8.472474000572465, + "grad_norm": 0.001198368612676859, + "learning_rate": 6.933764212098354e-06, + "loss": 0.0001, + "step": 5550 + }, + { + "epoch": 8.472474000572465, + "eval_loss": 0.09484256058931351, + "eval_runtime": 160.3128, + "eval_samples_per_second": 7.267, + "eval_steps_per_second": 7.267, + "step": 5550 + }, + { + "epoch": 8.487739719492415, + "grad_norm": 0.002751422580331564, + "learning_rate": 6.798998854596189e-06, + "loss": 0.0003, + "step": 5560 + }, + { + "epoch": 8.503005438412366, + "grad_norm": 0.0015565926441922784, + "learning_rate": 6.6654604415709425e-06, + "loss": 0.001, + "step": 5570 + }, + { + "epoch": 8.518271157332316, + "grad_norm": 0.0031182761304080486, + "learning_rate": 6.533152765624978e-06, + "loss": 0.0018, + "step": 5580 + }, + { + "epoch": 8.533536876252265, + "grad_norm": 0.0013917662436142564, + "learning_rate": 6.402079584406673e-06, + "loss": 0.0005, + "step": 5590 + }, + { + "epoch": 8.548802595172216, + "grad_norm": 0.00270106946118176, + "learning_rate": 6.272244620503731e-06, + "loss": 0.0001, + "step": 5600 + }, + { + "epoch": 8.548802595172216, + "eval_loss": 0.09488116204738617, + "eval_runtime": 160.2399, + "eval_samples_per_second": 7.27, + "eval_steps_per_second": 7.27, + "step": 5600 + }, + { + "epoch": 8.564068314092166, + "grad_norm": 0.0007605583523400128, + "learning_rate": 6.143651561337466e-06, + "loss": 0.0002, + "step": 5610 + }, + { + "epoch": 8.579334033012117, + "grad_norm": 0.027013279497623444, + "learning_rate": 6.016304059058031e-06, + "loss": 0.0003, + "step": 5620 + }, + { + "epoch": 8.594599751932067, + "grad_norm": 0.0034741011913865805, + "learning_rate": 5.890205730440762e-06, + "loss": 0.0008, + "step": 5630 + }, + { + "epoch": 8.609865470852018, + "grad_norm": 0.005648314021527767, + "learning_rate": 5.765360156783384e-06, + "loss": 0.0002, + "step": 5640 + }, + { + "epoch": 8.625131189771968, + "grad_norm": 0.040052514523267746, + "learning_rate": 5.641770883804365e-06, + "loss": 0.0005, + "step": 5650 + }, + { + "epoch": 8.625131189771968, + "eval_loss": 0.0964658185839653, + "eval_runtime": 160.1324, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 7.275, + "step": 5650 + }, + { + "epoch": 8.640396908691919, + "grad_norm": 0.011287623085081577, + "learning_rate": 5.519441421542165e-06, + "loss": 0.0001, + "step": 5660 + }, + { + "epoch": 8.65566262761187, + "grad_norm": 0.004145721439272165, + "learning_rate": 5.398375244255577e-06, + "loss": 0.0001, + "step": 5670 + }, + { + "epoch": 8.67092834653182, + "grad_norm": 0.004152486566454172, + "learning_rate": 5.2785757903250525e-06, + "loss": 0.0004, + "step": 5680 + }, + { + "epoch": 8.68619406545177, + "grad_norm": 0.007330935914069414, + "learning_rate": 5.160046462155032e-06, + "loss": 0.0003, + "step": 5690 + }, + { + "epoch": 8.701459784371721, + "grad_norm": 0.001574644003994763, + "learning_rate": 5.042790626077337e-06, + "loss": 0.0009, + "step": 5700 + }, + { + "epoch": 8.701459784371721, + "eval_loss": 0.09799868613481522, + "eval_runtime": 160.2043, + "eval_samples_per_second": 7.272, + "eval_steps_per_second": 7.272, + "step": 5700 + }, + { + "epoch": 8.71672550329167, + "grad_norm": 0.04094800353050232, + "learning_rate": 4.926811612255538e-06, + "loss": 0.0001, + "step": 5710 + }, + { + "epoch": 8.73199122221162, + "grad_norm": 0.1960475742816925, + "learning_rate": 4.812112714590383e-06, + "loss": 0.0017, + "step": 5720 + }, + { + "epoch": 8.747256941131571, + "grad_norm": 0.0014480681857094169, + "learning_rate": 4.698697190626278e-06, + "loss": 0.0001, + "step": 5730 + }, + { + "epoch": 8.762522660051522, + "grad_norm": 0.003530193818733096, + "learning_rate": 4.586568261458729e-06, + "loss": 0.0001, + "step": 5740 + }, + { + "epoch": 8.777788378971472, + "grad_norm": 0.003491970943287015, + "learning_rate": 4.475729111642873e-06, + "loss": 0.001, + "step": 5750 + }, + { + "epoch": 8.777788378971472, + "eval_loss": 0.09789612144231796, + "eval_runtime": 160.1368, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 7.275, + "step": 5750 + }, + { + "epoch": 8.793054097891423, + "grad_norm": 0.005949192680418491, + "learning_rate": 4.366182889103049e-06, + "loss": 0.0008, + "step": 5760 + }, + { + "epoch": 8.808319816811373, + "grad_norm": 0.014645606279373169, + "learning_rate": 4.257932705043371e-06, + "loss": 0.0009, + "step": 5770 + }, + { + "epoch": 8.823585535731324, + "grad_norm": 0.0013707492034882307, + "learning_rate": 4.150981633859391e-06, + "loss": 0.0, + "step": 5780 + }, + { + "epoch": 8.838851254651274, + "grad_norm": 0.0008816500776447356, + "learning_rate": 4.045332713050759e-06, + "loss": 0.0001, + "step": 5790 + }, + { + "epoch": 8.854116973571225, + "grad_norm": 0.0037713542114943266, + "learning_rate": 3.940988943134966e-06, + "loss": 0.0002, + "step": 5800 + }, + { + "epoch": 8.854116973571225, + "eval_loss": 0.09762009978294373, + "eval_runtime": 160.1885, + "eval_samples_per_second": 7.273, + "eval_steps_per_second": 7.273, + "step": 5800 + }, + { + "epoch": 8.869382692491175, + "grad_norm": 0.0035862771328538656, + "learning_rate": 3.837953287562146e-06, + "loss": 0.0001, + "step": 5810 + }, + { + "epoch": 8.884648411411124, + "grad_norm": 0.0011815731413662434, + "learning_rate": 3.7362286726308813e-06, + "loss": 0.0001, + "step": 5820 + }, + { + "epoch": 8.899914130331075, + "grad_norm": 0.0014588013291358948, + "learning_rate": 3.63581798740511e-06, + "loss": 0.0001, + "step": 5830 + }, + { + "epoch": 8.915179849251025, + "grad_norm": 0.0009648874402046204, + "learning_rate": 3.53672408363207e-06, + "loss": 0.0001, + "step": 5840 + }, + { + "epoch": 8.930445568170976, + "grad_norm": 0.00453564478084445, + "learning_rate": 3.438949775661304e-06, + "loss": 0.0004, + "step": 5850 + }, + { + "epoch": 8.930445568170976, + "eval_loss": 0.09873552620410919, + "eval_runtime": 160.1742, + "eval_samples_per_second": 7.273, + "eval_steps_per_second": 7.273, + "step": 5850 + }, + { + "epoch": 8.945711287090926, + "grad_norm": 0.0016943690134212375, + "learning_rate": 3.3424978403647446e-06, + "loss": 0.0001, + "step": 5860 + }, + { + "epoch": 8.960977006010877, + "grad_norm": 0.0011834116885438561, + "learning_rate": 3.24737101705781e-06, + "loss": 0.0, + "step": 5870 + }, + { + "epoch": 8.976242724930827, + "grad_norm": 0.00666852667927742, + "learning_rate": 3.1535720074216568e-06, + "loss": 0.0001, + "step": 5880 + }, + { + "epoch": 8.991508443850778, + "grad_norm": 0.004008435178548098, + "learning_rate": 3.06110347542643e-06, + "loss": 0.0002, + "step": 5890 + }, + { + "epoch": 9.006774162770729, + "grad_norm": 0.0012721408857032657, + "learning_rate": 2.969968047255578e-06, + "loss": 0.0008, + "step": 5900 + }, + { + "epoch": 9.006774162770729, + "eval_loss": 0.09944406896829605, + "eval_runtime": 160.1761, + "eval_samples_per_second": 7.273, + "eval_steps_per_second": 7.273, + "step": 5900 + }, + { + "epoch": 9.022039881690679, + "grad_norm": 0.00584765849635005, + "learning_rate": 2.8801683112313072e-06, + "loss": 0.0, + "step": 5910 + }, + { + "epoch": 9.03730560061063, + "grad_norm": 0.005886030849069357, + "learning_rate": 2.791706817741041e-06, + "loss": 0.0002, + "step": 5920 + }, + { + "epoch": 9.052571319530578, + "grad_norm": 0.0006561594200320542, + "learning_rate": 2.7045860791650057e-06, + "loss": 0.0001, + "step": 5930 + }, + { + "epoch": 9.067837038450529, + "grad_norm": 0.005868405103683472, + "learning_rate": 2.618808569804848e-06, + "loss": 0.0001, + "step": 5940 + }, + { + "epoch": 9.08310275737048, + "grad_norm": 0.007774353958666325, + "learning_rate": 2.5343767258134197e-06, + "loss": 0.0001, + "step": 5950 + }, + { + "epoch": 9.08310275737048, + "eval_loss": 0.10015840083360672, + "eval_runtime": 160.2538, + "eval_samples_per_second": 7.27, + "eval_steps_per_second": 7.27, + "step": 5950 + }, + { + "epoch": 9.09836847629043, + "grad_norm": 0.00030524193425662816, + "learning_rate": 2.451292945125516e-06, + "loss": 0.0001, + "step": 5960 + }, + { + "epoch": 9.11363419521038, + "grad_norm": 0.008615519851446152, + "learning_rate": 2.369559587389852e-06, + "loss": 0.0001, + "step": 5970 + }, + { + "epoch": 9.128899914130331, + "grad_norm": 0.0017607224872335792, + "learning_rate": 2.289178973901973e-06, + "loss": 0.0001, + "step": 5980 + }, + { + "epoch": 9.144165633050282, + "grad_norm": 0.008259010501205921, + "learning_rate": 2.210153387538366e-06, + "loss": 0.0002, + "step": 5990 + }, + { + "epoch": 9.159431351970232, + "grad_norm": 0.01984138786792755, + "learning_rate": 2.132485072691609e-06, + "loss": 0.0001, + "step": 6000 + }, + { + "epoch": 9.159431351970232, + "eval_loss": 0.10065776854753494, + "eval_runtime": 160.2435, + "eval_samples_per_second": 7.27, + "eval_steps_per_second": 7.27, + "step": 6000 + }, + { + "epoch": 9.174697070890183, + "grad_norm": 0.0024705601390451193, + "learning_rate": 2.056176235206664e-06, + "loss": 0.0002, + "step": 6010 + }, + { + "epoch": 9.189962789810133, + "grad_norm": 0.00856366753578186, + "learning_rate": 1.9812290423181666e-06, + "loss": 0.0005, + "step": 6020 + }, + { + "epoch": 9.205228508730084, + "grad_norm": 0.0022890339605510235, + "learning_rate": 1.907645622588933e-06, + "loss": 0.0009, + "step": 6030 + }, + { + "epoch": 9.220494227650033, + "grad_norm": 0.003478351281955838, + "learning_rate": 1.8354280658494649e-06, + "loss": 0.0001, + "step": 6040 + }, + { + "epoch": 9.235759946569983, + "grad_norm": 0.013585424050688744, + "learning_rate": 1.7645784231386519e-06, + "loss": 0.0001, + "step": 6050 + }, + { + "epoch": 9.235759946569983, + "eval_loss": 0.10102899372577667, + "eval_runtime": 160.0612, + "eval_samples_per_second": 7.278, + "eval_steps_per_second": 7.278, + "step": 6050 + }, + { + "epoch": 9.251025665489934, + "grad_norm": 0.0016542715020477772, + "learning_rate": 1.6950987066454428e-06, + "loss": 0.0003, + "step": 6060 + }, + { + "epoch": 9.266291384409884, + "grad_norm": 0.0008330322452820837, + "learning_rate": 1.6269908896517638e-06, + "loss": 0.0001, + "step": 6070 + }, + { + "epoch": 9.281557103329835, + "grad_norm": 0.009314282797276974, + "learning_rate": 1.5602569064764428e-06, + "loss": 0.0, + "step": 6080 + }, + { + "epoch": 9.296822822249785, + "grad_norm": 0.0006161289056763053, + "learning_rate": 1.4948986524202735e-06, + "loss": 0.0, + "step": 6090 + }, + { + "epoch": 9.312088541169736, + "grad_norm": 0.00438966229557991, + "learning_rate": 1.4309179837122044e-06, + "loss": 0.0001, + "step": 6100 + }, + { + "epoch": 9.312088541169736, + "eval_loss": 0.1017271876335144, + "eval_runtime": 160.1377, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 7.275, + "step": 6100 + }, + { + "epoch": 9.327354260089686, + "grad_norm": 0.021119097247719765, + "learning_rate": 1.368316717456608e-06, + "loss": 0.0001, + "step": 6110 + }, + { + "epoch": 9.342619979009637, + "grad_norm": 0.004084915854036808, + "learning_rate": 1.307096631581678e-06, + "loss": 0.0004, + "step": 6120 + }, + { + "epoch": 9.357885697929587, + "grad_norm": 0.012193146161735058, + "learning_rate": 1.2472594647889358e-06, + "loss": 0.0002, + "step": 6130 + }, + { + "epoch": 9.373151416849538, + "grad_norm": 0.001829653512686491, + "learning_rate": 1.188806916503843e-06, + "loss": 0.0, + "step": 6140 + }, + { + "epoch": 9.388417135769487, + "grad_norm": 0.00019697945390362293, + "learning_rate": 1.1317406468275393e-06, + "loss": 0.0003, + "step": 6150 + }, + { + "epoch": 9.388417135769487, + "eval_loss": 0.10189900547266006, + "eval_runtime": 160.1361, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 7.275, + "step": 6150 + }, + { + "epoch": 9.403682854689437, + "grad_norm": 0.0008664363413117826, + "learning_rate": 1.0760622764896866e-06, + "loss": 0.0001, + "step": 6160 + }, + { + "epoch": 9.418948573609388, + "grad_norm": 0.004286602605134249, + "learning_rate": 1.0217733868024715e-06, + "loss": 0.0, + "step": 6170 + }, + { + "epoch": 9.434214292529338, + "grad_norm": 0.0006451936205849051, + "learning_rate": 9.688755196156417e-07, + "loss": 0.0001, + "step": 6180 + }, + { + "epoch": 9.449480011449289, + "grad_norm": 0.004993918817490339, + "learning_rate": 9.17370177272775e-07, + "loss": 0.0011, + "step": 6190 + }, + { + "epoch": 9.46474573036924, + "grad_norm": 0.007353432942181826, + "learning_rate": 8.672588225685563e-07, + "loss": 0.0001, + "step": 6200 + }, + { + "epoch": 9.46474573036924, + "eval_loss": 0.10175897181034088, + "eval_runtime": 160.2659, + "eval_samples_per_second": 7.269, + "eval_steps_per_second": 7.269, + "step": 6200 + }, + { + "epoch": 9.48001144928919, + "grad_norm": 0.0025520974304527044, + "learning_rate": 8.185428787072736e-07, + "loss": 0.0, + "step": 6210 + }, + { + "epoch": 9.49527716820914, + "grad_norm": 0.0007525197579525411, + "learning_rate": 7.712237292623769e-07, + "loss": 0.0002, + "step": 6220 + }, + { + "epoch": 9.510542887129091, + "grad_norm": 0.0019715516828000546, + "learning_rate": 7.253027181371996e-07, + "loss": 0.0003, + "step": 6230 + }, + { + "epoch": 9.525808606049042, + "grad_norm": 0.008852859027683735, + "learning_rate": 6.807811495267602e-07, + "loss": 0.0, + "step": 6240 + }, + { + "epoch": 9.541074324968992, + "grad_norm": 0.009848060086369514, + "learning_rate": 6.376602878807591e-07, + "loss": 0.0006, + "step": 6250 + }, + { + "epoch": 9.541074324968992, + "eval_loss": 0.1022804006934166, + "eval_runtime": 160.2113, + "eval_samples_per_second": 7.272, + "eval_steps_per_second": 7.272, + "step": 6250 + }, + { + "epoch": 9.556340043888941, + "grad_norm": 0.0008501363336108625, + "learning_rate": 5.959413578676354e-07, + "loss": 0.0001, + "step": 6260 + }, + { + "epoch": 9.571605762808892, + "grad_norm": 0.001532863243483007, + "learning_rate": 5.556255443398217e-07, + "loss": 0.0001, + "step": 6270 + }, + { + "epoch": 9.586871481728842, + "grad_norm": 0.042831603437662125, + "learning_rate": 5.167139923000553e-07, + "loss": 0.0001, + "step": 6280 + }, + { + "epoch": 9.602137200648793, + "grad_norm": 0.0012665553949773312, + "learning_rate": 4.792078068688699e-07, + "loss": 0.0, + "step": 6290 + }, + { + "epoch": 9.617402919568743, + "grad_norm": 0.0022448324598371983, + "learning_rate": 4.431080532532439e-07, + "loss": 0.0001, + "step": 6300 + }, + { + "epoch": 9.617402919568743, + "eval_loss": 0.10223036259412766, + "eval_runtime": 160.2183, + "eval_samples_per_second": 7.271, + "eval_steps_per_second": 7.271, + "step": 6300 + }, + { + "epoch": 9.632668638488694, + "grad_norm": 0.0025191460736095905, + "learning_rate": 4.0841575671628495e-07, + "loss": 0.0, + "step": 6310 + }, + { + "epoch": 9.647934357408644, + "grad_norm": 0.005089775193482637, + "learning_rate": 3.7513190254816475e-07, + "loss": 0.0001, + "step": 6320 + }, + { + "epoch": 9.663200076328595, + "grad_norm": 0.11856232583522797, + "learning_rate": 3.43257436038108e-07, + "loss": 0.0002, + "step": 6330 + }, + { + "epoch": 9.678465795248545, + "grad_norm": 0.009028649888932705, + "learning_rate": 3.127932624475638e-07, + "loss": 0.0003, + "step": 6340 + }, + { + "epoch": 9.693731514168496, + "grad_norm": 0.20022647082805634, + "learning_rate": 2.837402469844708e-07, + "loss": 0.0006, + "step": 6350 + }, + { + "epoch": 9.693731514168496, + "eval_loss": 0.10262858867645264, + "eval_runtime": 160.2111, + "eval_samples_per_second": 7.272, + "eval_steps_per_second": 7.272, + "step": 6350 + }, + { + "epoch": 9.708997233088446, + "grad_norm": 0.08500424772500992, + "learning_rate": 2.560992147787211e-07, + "loss": 0.0004, + "step": 6360 + }, + { + "epoch": 9.724262952008397, + "grad_norm": 0.0030627259984612465, + "learning_rate": 2.2987095085867937e-07, + "loss": 0.0003, + "step": 6370 + }, + { + "epoch": 9.739528670928346, + "grad_norm": 9.990138642024249e-05, + "learning_rate": 2.0505620012893357e-07, + "loss": 0.0008, + "step": 6380 + }, + { + "epoch": 9.754794389848296, + "grad_norm": 0.00883451197296381, + "learning_rate": 1.81655667349101e-07, + "loss": 0.0001, + "step": 6390 + }, + { + "epoch": 9.770060108768247, + "grad_norm": 0.0019866861402988434, + "learning_rate": 1.5967001711383877e-07, + "loss": 0.0001, + "step": 6400 + }, + { + "epoch": 9.770060108768247, + "eval_loss": 0.10253015905618668, + "eval_runtime": 160.2358, + "eval_samples_per_second": 7.271, + "eval_steps_per_second": 7.271, + "step": 6400 + }, + { + "epoch": 9.785325827688197, + "grad_norm": 9.332612535217777e-05, + "learning_rate": 1.3909987383396438e-07, + "loss": 0.0001, + "step": 6410 + }, + { + "epoch": 9.800591546608148, + "grad_norm": 0.009237647987902164, + "learning_rate": 1.1994582171869773e-07, + "loss": 0.0, + "step": 6420 + }, + { + "epoch": 9.815857265528098, + "grad_norm": 0.01163227204233408, + "learning_rate": 1.0220840475910764e-07, + "loss": 0.0001, + "step": 6430 + }, + { + "epoch": 9.831122984448049, + "grad_norm": 0.005329756066203117, + "learning_rate": 8.588812671262992e-08, + "loss": 0.0001, + "step": 6440 + }, + { + "epoch": 9.846388703368, + "grad_norm": 0.0073890360072255135, + "learning_rate": 7.098545108877863e-08, + "loss": 0.0, + "step": 6450 + }, + { + "epoch": 9.846388703368, + "eval_loss": 0.10237367451190948, + "eval_runtime": 160.2764, + "eval_samples_per_second": 7.269, + "eval_steps_per_second": 7.269, + "step": 6450 + }, + { + "epoch": 9.86165442228795, + "grad_norm": 0.020936548709869385, + "learning_rate": 5.750080113598455e-08, + "loss": 0.0005, + "step": 6460 + }, + { + "epoch": 9.8769201412079, + "grad_norm": 0.011315480805933475, + "learning_rate": 4.543455982954359e-08, + "loss": 0.0001, + "step": 6470 + }, + { + "epoch": 9.892185860127851, + "grad_norm": 0.0018150415271520615, + "learning_rate": 3.4787069860786616e-08, + "loss": 0.0001, + "step": 6480 + }, + { + "epoch": 9.907451579047802, + "grad_norm": 0.021576542407274246, + "learning_rate": 2.5558633627303928e-08, + "loss": 0.0001, + "step": 6490 + }, + { + "epoch": 9.92271729796775, + "grad_norm": 0.001416212529875338, + "learning_rate": 1.7749513224391002e-08, + "loss": 0.0, + "step": 6500 + }, + { + "epoch": 9.92271729796775, + "eval_loss": 0.10233203321695328, + "eval_runtime": 160.2471, + "eval_samples_per_second": 7.27, + "eval_steps_per_second": 7.27, + "step": 6500 + }, + { + "epoch": 9.937983016887701, + "grad_norm": 0.001318891765549779, + "learning_rate": 1.1359930437582211e-08, + "loss": 0.0, + "step": 6510 + }, + { + "epoch": 9.953248735807652, + "grad_norm": 0.003906587604433298, + "learning_rate": 6.3900667363558934e-09, + "loss": 0.0001, + "step": 6520 + }, + { + "epoch": 9.968514454727602, + "grad_norm": 0.0006608036928810179, + "learning_rate": 2.8400632689884467e-09, + "loss": 0.0006, + "step": 6530 + }, + { + "epoch": 9.983780173647553, + "grad_norm": 0.000979644013568759, + "learning_rate": 7.10020858540883e-10, + "loss": 0.0, + "step": 6540 + }, + { + "epoch": 9.999045892567503, + "grad_norm": 0.005197067745029926, + "learning_rate": 0.0, + "loss": 0.0002, + "step": 6550 + }, + { + "epoch": 9.999045892567503, + "eval_loss": 0.10255397856235504, + "eval_runtime": 160.3566, + "eval_samples_per_second": 7.265, + "eval_steps_per_second": 7.265, + "step": 6550 + }, + { + "epoch": 9.999045892567503, + "step": 6550, + "total_flos": 1.6441614685569024e+18, + "train_loss": 0.029180648955241032, + "train_runtime": 72321.8375, + "train_samples_per_second": 1.449, + "train_steps_per_second": 0.091 + } + ], + "logging_steps": 10, + "max_steps": 6550, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6441614685569024e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}