{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7250168395527414, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008621851003637344, "grad_norm": 1.4002951383590698, "learning_rate": 1.4367816091954023e-07, "loss": 1.6797, "step": 10 }, { "epoch": 0.017243702007274687, "grad_norm": 1.374281644821167, "learning_rate": 2.8735632183908047e-07, "loss": 1.6722, "step": 20 }, { "epoch": 0.02586555301091203, "grad_norm": 1.2334840297698975, "learning_rate": 4.3103448275862073e-07, "loss": 1.6699, "step": 30 }, { "epoch": 0.034487404014549375, "grad_norm": 1.3353266716003418, "learning_rate": 5.747126436781609e-07, "loss": 1.6697, "step": 40 }, { "epoch": 0.043109255018186715, "grad_norm": 1.3006664514541626, "learning_rate": 7.183908045977011e-07, "loss": 1.6547, "step": 50 }, { "epoch": 0.05173110602182406, "grad_norm": 1.2290219068527222, "learning_rate": 8.620689655172415e-07, "loss": 1.6121, "step": 60 }, { "epoch": 0.0603529570254614, "grad_norm": 0.8605530261993408, "learning_rate": 1.0057471264367817e-06, "loss": 1.5626, "step": 70 }, { "epoch": 0.06897480802909875, "grad_norm": 0.5922505259513855, "learning_rate": 1.1494252873563219e-06, "loss": 1.5188, "step": 80 }, { "epoch": 0.07759665903273609, "grad_norm": 0.5861937999725342, "learning_rate": 1.2931034482758623e-06, "loss": 1.5159, "step": 90 }, { "epoch": 0.08621851003637343, "grad_norm": 0.5518978238105774, "learning_rate": 1.4367816091954023e-06, "loss": 1.4833, "step": 100 }, { "epoch": 0.09484036104001077, "grad_norm": 0.5048322677612305, "learning_rate": 1.5804597701149427e-06, "loss": 1.4932, "step": 110 }, { "epoch": 0.10346221204364812, "grad_norm": 0.6119816899299622, "learning_rate": 1.724137931034483e-06, "loss": 1.4988, "step": 120 }, { "epoch": 0.11208406304728546, "grad_norm": 0.5448249578475952, "learning_rate": 1.8678160919540231e-06, "loss": 1.4513, "step": 130 }, { "epoch": 0.1207059140509228, "grad_norm": 0.5504677295684814, "learning_rate": 2.0114942528735633e-06, "loss": 1.4618, "step": 140 }, { "epoch": 0.12932776505456015, "grad_norm": 0.5333659052848816, "learning_rate": 2.1551724137931035e-06, "loss": 1.4717, "step": 150 }, { "epoch": 0.1379496160581975, "grad_norm": 0.7873896360397339, "learning_rate": 2.2988505747126437e-06, "loss": 1.4464, "step": 160 }, { "epoch": 0.14657146706183483, "grad_norm": 0.9079521298408508, "learning_rate": 2.4425287356321844e-06, "loss": 1.4313, "step": 170 }, { "epoch": 0.15519331806547218, "grad_norm": 0.7075573801994324, "learning_rate": 2.5862068965517246e-06, "loss": 1.4527, "step": 180 }, { "epoch": 0.16381516906910953, "grad_norm": 0.5483418107032776, "learning_rate": 2.729885057471265e-06, "loss": 1.4109, "step": 190 }, { "epoch": 0.17243702007274686, "grad_norm": 0.5097762942314148, "learning_rate": 2.8735632183908046e-06, "loss": 1.3786, "step": 200 }, { "epoch": 0.18105887107638421, "grad_norm": 0.6269800066947937, "learning_rate": 3.017241379310345e-06, "loss": 1.4585, "step": 210 }, { "epoch": 0.18968072208002154, "grad_norm": 0.5950655937194824, "learning_rate": 3.1609195402298854e-06, "loss": 1.4093, "step": 220 }, { "epoch": 0.1983025730836589, "grad_norm": 0.597637414932251, "learning_rate": 3.3045977011494256e-06, "loss": 1.407, "step": 230 }, { "epoch": 0.20692442408729625, "grad_norm": 0.5727440118789673, "learning_rate": 3.448275862068966e-06, "loss": 1.3743, "step": 240 }, { "epoch": 0.21554627509093358, "grad_norm": 0.5026169419288635, "learning_rate": 3.5919540229885056e-06, "loss": 1.3868, "step": 250 }, { "epoch": 0.22416812609457093, "grad_norm": 0.5612446069717407, "learning_rate": 3.7356321839080462e-06, "loss": 1.3997, "step": 260 }, { "epoch": 0.23278997709820828, "grad_norm": 0.5654894709587097, "learning_rate": 3.8793103448275865e-06, "loss": 1.3925, "step": 270 }, { "epoch": 0.2414118281018456, "grad_norm": 0.5213720798492432, "learning_rate": 4.022988505747127e-06, "loss": 1.3982, "step": 280 }, { "epoch": 0.25003367910548296, "grad_norm": 0.6513163447380066, "learning_rate": 4.166666666666667e-06, "loss": 1.4087, "step": 290 }, { "epoch": 0.2586555301091203, "grad_norm": 0.5747817158699036, "learning_rate": 4.310344827586207e-06, "loss": 1.3629, "step": 300 }, { "epoch": 0.2672773811127577, "grad_norm": 0.5259600877761841, "learning_rate": 4.454022988505747e-06, "loss": 1.3932, "step": 310 }, { "epoch": 0.275899232116395, "grad_norm": 0.5602086782455444, "learning_rate": 4.5977011494252875e-06, "loss": 1.3537, "step": 320 }, { "epoch": 0.2845210831200323, "grad_norm": 0.6604083180427551, "learning_rate": 4.741379310344828e-06, "loss": 1.3651, "step": 330 }, { "epoch": 0.29314293412366965, "grad_norm": 0.5549290180206299, "learning_rate": 4.885057471264369e-06, "loss": 1.3452, "step": 340 }, { "epoch": 0.30176478512730703, "grad_norm": 0.548821210861206, "learning_rate": 4.999994959675734e-06, "loss": 1.3647, "step": 350 }, { "epoch": 0.31038663613094436, "grad_norm": 0.5312780141830444, "learning_rate": 4.9998185504603824e-06, "loss": 1.3404, "step": 360 }, { "epoch": 0.3190084871345817, "grad_norm": 0.5477195382118225, "learning_rate": 4.999390145355199e-06, "loss": 1.3841, "step": 370 }, { "epoch": 0.32763033813821907, "grad_norm": 0.5766547322273254, "learning_rate": 4.998709787545849e-06, "loss": 1.3594, "step": 380 }, { "epoch": 0.3362521891418564, "grad_norm": 0.5785459280014038, "learning_rate": 4.997777545616258e-06, "loss": 1.3402, "step": 390 }, { "epoch": 0.3448740401454937, "grad_norm": 0.5505363941192627, "learning_rate": 4.996593513541701e-06, "loss": 1.3355, "step": 400 }, { "epoch": 0.3534958911491311, "grad_norm": 0.5421465039253235, "learning_rate": 4.995157810679327e-06, "loss": 1.359, "step": 410 }, { "epoch": 0.36211774215276843, "grad_norm": 0.5955513119697571, "learning_rate": 4.993470581756129e-06, "loss": 1.3743, "step": 420 }, { "epoch": 0.37073959315640576, "grad_norm": 0.6144652366638184, "learning_rate": 4.991531996854352e-06, "loss": 1.3447, "step": 430 }, { "epoch": 0.3793614441600431, "grad_norm": 0.5953258872032166, "learning_rate": 4.989342251394352e-06, "loss": 1.3208, "step": 440 }, { "epoch": 0.38798329516368046, "grad_norm": 0.5650104880332947, "learning_rate": 4.986901566114891e-06, "loss": 1.3562, "step": 450 }, { "epoch": 0.3966051461673178, "grad_norm": 0.8173232078552246, "learning_rate": 4.984210187050891e-06, "loss": 1.3151, "step": 460 }, { "epoch": 0.4052269971709551, "grad_norm": 0.5520560145378113, "learning_rate": 4.981268385508627e-06, "loss": 1.3591, "step": 470 }, { "epoch": 0.4138488481745925, "grad_norm": 0.5939339399337769, "learning_rate": 4.978076458038382e-06, "loss": 1.3306, "step": 480 }, { "epoch": 0.4224706991782298, "grad_norm": 0.5531189441680908, "learning_rate": 4.974634726404551e-06, "loss": 1.3338, "step": 490 }, { "epoch": 0.43109255018186715, "grad_norm": 0.7108302116394043, "learning_rate": 4.9709435375532065e-06, "loss": 1.3248, "step": 500 }, { "epoch": 0.43109255018186715, "eval_loss": 1.210019826889038, "eval_runtime": 4375.303, "eval_samples_per_second": 15.08, "eval_steps_per_second": 7.54, "step": 500 }, { "epoch": 0.43971440118550453, "grad_norm": 0.47356271743774414, "learning_rate": 4.9670032635771205e-06, "loss": 1.3342, "step": 510 }, { "epoch": 0.44833625218914186, "grad_norm": 0.4977116286754608, "learning_rate": 4.962814301678262e-06, "loss": 1.3412, "step": 520 }, { "epoch": 0.4569581031927792, "grad_norm": 0.534755289554596, "learning_rate": 4.958377074127751e-06, "loss": 1.32, "step": 530 }, { "epoch": 0.46557995419641657, "grad_norm": 0.5627906918525696, "learning_rate": 4.953692028223295e-06, "loss": 1.3275, "step": 540 }, { "epoch": 0.4742018052000539, "grad_norm": 0.5472209453582764, "learning_rate": 4.948759636244096e-06, "loss": 1.3352, "step": 550 }, { "epoch": 0.4828236562036912, "grad_norm": 0.5113406777381897, "learning_rate": 4.943580395403244e-06, "loss": 1.31, "step": 560 }, { "epoch": 0.49144550720732855, "grad_norm": 0.6487182974815369, "learning_rate": 4.938154827797595e-06, "loss": 1.2995, "step": 570 }, { "epoch": 0.5000673582109659, "grad_norm": 0.6053293347358704, "learning_rate": 4.932483480355139e-06, "loss": 1.3377, "step": 580 }, { "epoch": 0.5086892092146033, "grad_norm": 0.5979019999504089, "learning_rate": 4.926566924779869e-06, "loss": 1.3169, "step": 590 }, { "epoch": 0.5173110602182406, "grad_norm": 0.6338688135147095, "learning_rate": 4.920405757494147e-06, "loss": 1.2965, "step": 600 }, { "epoch": 0.5259329112218779, "grad_norm": 0.5050321221351624, "learning_rate": 4.914000599578585e-06, "loss": 1.3246, "step": 610 }, { "epoch": 0.5345547622255153, "grad_norm": 0.5875179767608643, "learning_rate": 4.907352096709432e-06, "loss": 1.337, "step": 620 }, { "epoch": 0.5431766132291527, "grad_norm": 0.6425178647041321, "learning_rate": 4.900460919093492e-06, "loss": 1.2946, "step": 630 }, { "epoch": 0.55179846423279, "grad_norm": 0.541878342628479, "learning_rate": 4.893327761400557e-06, "loss": 1.2993, "step": 640 }, { "epoch": 0.5604203152364273, "grad_norm": 0.586501955986023, "learning_rate": 4.885953342693384e-06, "loss": 1.3011, "step": 650 }, { "epoch": 0.5690421662400647, "grad_norm": 0.5775993466377258, "learning_rate": 4.878338406355211e-06, "loss": 1.3213, "step": 660 }, { "epoch": 0.577664017243702, "grad_norm": 0.5908535718917847, "learning_rate": 4.870483720014814e-06, "loss": 1.2963, "step": 670 }, { "epoch": 0.5862858682473393, "grad_norm": 0.5903546810150146, "learning_rate": 4.862390075469132e-06, "loss": 1.2818, "step": 680 }, { "epoch": 0.5949077192509767, "grad_norm": 0.6688754558563232, "learning_rate": 4.854058288603445e-06, "loss": 1.3254, "step": 690 }, { "epoch": 0.6035295702546141, "grad_norm": 0.5674655437469482, "learning_rate": 4.8454891993091305e-06, "loss": 1.2957, "step": 700 }, { "epoch": 0.6121514212582514, "grad_norm": 0.6107905507087708, "learning_rate": 4.836683671398995e-06, "loss": 1.2824, "step": 710 }, { "epoch": 0.6207732722618887, "grad_norm": 0.5999839305877686, "learning_rate": 4.827642592520203e-06, "loss": 1.2977, "step": 720 }, { "epoch": 0.629395123265526, "grad_norm": 0.5449870824813843, "learning_rate": 4.818366874064789e-06, "loss": 1.2949, "step": 730 }, { "epoch": 0.6380169742691634, "grad_norm": 0.5735543966293335, "learning_rate": 4.808857451077788e-06, "loss": 1.3084, "step": 740 }, { "epoch": 0.6466388252728007, "grad_norm": 0.5688530802726746, "learning_rate": 4.799115282162979e-06, "loss": 1.2974, "step": 750 }, { "epoch": 0.6552606762764381, "grad_norm": 0.5878692269325256, "learning_rate": 4.789141349386249e-06, "loss": 1.3138, "step": 760 }, { "epoch": 0.6638825272800755, "grad_norm": 0.642494261264801, "learning_rate": 4.7789366581765995e-06, "loss": 1.285, "step": 770 }, { "epoch": 0.6725043782837128, "grad_norm": 0.6337887644767761, "learning_rate": 4.768502237224788e-06, "loss": 1.295, "step": 780 }, { "epoch": 0.6811262292873501, "grad_norm": 0.6511521935462952, "learning_rate": 4.757839138379635e-06, "loss": 1.3059, "step": 790 }, { "epoch": 0.6897480802909874, "grad_norm": 0.6140688061714172, "learning_rate": 4.74694843654199e-06, "loss": 1.2781, "step": 800 }, { "epoch": 0.6983699312946248, "grad_norm": 0.5881298780441284, "learning_rate": 4.735831229556374e-06, "loss": 1.2944, "step": 810 }, { "epoch": 0.7069917822982622, "grad_norm": 0.6124337315559387, "learning_rate": 4.7244886381003115e-06, "loss": 1.287, "step": 820 }, { "epoch": 0.7156136333018995, "grad_norm": 0.5487476587295532, "learning_rate": 4.712921805571362e-06, "loss": 1.2885, "step": 830 }, { "epoch": 0.7242354843055369, "grad_norm": 0.6456742286682129, "learning_rate": 4.7011318979718565e-06, "loss": 1.2899, "step": 840 }, { "epoch": 0.7328573353091742, "grad_norm": 0.5877824425697327, "learning_rate": 4.689120103791356e-06, "loss": 1.3066, "step": 850 }, { "epoch": 0.7414791863128115, "grad_norm": 0.628680408000946, "learning_rate": 4.676887633886851e-06, "loss": 1.3101, "step": 860 }, { "epoch": 0.7501010373164488, "grad_norm": 0.6239911317825317, "learning_rate": 4.664435721360695e-06, "loss": 1.2782, "step": 870 }, { "epoch": 0.7587228883200862, "grad_norm": 0.5513969659805298, "learning_rate": 4.651765621436303e-06, "loss": 1.2836, "step": 880 }, { "epoch": 0.7673447393237236, "grad_norm": 0.5616466403007507, "learning_rate": 4.638878611331615e-06, "loss": 1.2967, "step": 890 }, { "epoch": 0.7759665903273609, "grad_norm": 1.2961684465408325, "learning_rate": 4.6257759901303535e-06, "loss": 1.3094, "step": 900 }, { "epoch": 0.7845884413309983, "grad_norm": 0.6225080490112305, "learning_rate": 4.612459078651055e-06, "loss": 1.3083, "step": 910 }, { "epoch": 0.7932102923346356, "grad_norm": 0.6216508150100708, "learning_rate": 4.598929219313938e-06, "loss": 1.3286, "step": 920 }, { "epoch": 0.8018321433382729, "grad_norm": 0.5944140553474426, "learning_rate": 4.585187776005569e-06, "loss": 1.263, "step": 930 }, { "epoch": 0.8104539943419102, "grad_norm": 0.5992977023124695, "learning_rate": 4.571236133941381e-06, "loss": 1.2745, "step": 940 }, { "epoch": 0.8190758453455477, "grad_norm": 0.5519088506698608, "learning_rate": 4.557075699526032e-06, "loss": 1.2772, "step": 950 }, { "epoch": 0.827697696349185, "grad_norm": 0.5918429493904114, "learning_rate": 4.542707900211636e-06, "loss": 1.2915, "step": 960 }, { "epoch": 0.8363195473528223, "grad_norm": 0.6135639548301697, "learning_rate": 4.528134184353863e-06, "loss": 1.2918, "step": 970 }, { "epoch": 0.8449413983564596, "grad_norm": 0.6600371599197388, "learning_rate": 4.5133560210659384e-06, "loss": 1.2844, "step": 980 }, { "epoch": 0.853563249360097, "grad_norm": 0.6321092844009399, "learning_rate": 4.498374900070551e-06, "loss": 1.282, "step": 990 }, { "epoch": 0.8621851003637343, "grad_norm": 0.5802695155143738, "learning_rate": 4.483192331549675e-06, "loss": 1.2723, "step": 1000 }, { "epoch": 0.8621851003637343, "eval_loss": 1.1568914651870728, "eval_runtime": 4375.5203, "eval_samples_per_second": 15.08, "eval_steps_per_second": 7.54, "step": 1000 }, { "epoch": 0.8708069513673716, "grad_norm": 0.5625444650650024, "learning_rate": 4.467809845992338e-06, "loss": 1.2788, "step": 1010 }, { "epoch": 0.8794288023710091, "grad_norm": 0.575935959815979, "learning_rate": 4.452228994040341e-06, "loss": 1.302, "step": 1020 }, { "epoch": 0.8880506533746464, "grad_norm": 0.5979976058006287, "learning_rate": 4.4364513463319405e-06, "loss": 1.271, "step": 1030 }, { "epoch": 0.8966725043782837, "grad_norm": 0.6508215069770813, "learning_rate": 4.420478493343523e-06, "loss": 1.2838, "step": 1040 }, { "epoch": 0.905294355381921, "grad_norm": 0.6415181756019592, "learning_rate": 4.404312045229273e-06, "loss": 1.2855, "step": 1050 }, { "epoch": 0.9139162063855584, "grad_norm": 0.59377521276474, "learning_rate": 4.387953631658863e-06, "loss": 1.2745, "step": 1060 }, { "epoch": 0.9225380573891957, "grad_norm": 0.6269784569740295, "learning_rate": 4.371404901653174e-06, "loss": 1.2667, "step": 1070 }, { "epoch": 0.9311599083928331, "grad_norm": 0.6030882000923157, "learning_rate": 4.35466752341806e-06, "loss": 1.2433, "step": 1080 }, { "epoch": 0.9397817593964705, "grad_norm": 0.6197340488433838, "learning_rate": 4.337743184176188e-06, "loss": 1.2791, "step": 1090 }, { "epoch": 0.9484036104001078, "grad_norm": 0.607699453830719, "learning_rate": 4.320633589996956e-06, "loss": 1.278, "step": 1100 }, { "epoch": 0.9570254614037451, "grad_norm": 0.6275235414505005, "learning_rate": 4.303340465624507e-06, "loss": 1.2587, "step": 1110 }, { "epoch": 0.9656473124073824, "grad_norm": 0.6535059213638306, "learning_rate": 4.285865554303874e-06, "loss": 1.2895, "step": 1120 }, { "epoch": 0.9742691634110198, "grad_norm": 0.6479883790016174, "learning_rate": 4.2682106176052405e-06, "loss": 1.2651, "step": 1130 }, { "epoch": 0.9828910144146571, "grad_norm": 0.7725274562835693, "learning_rate": 4.2503774352463735e-06, "loss": 1.2384, "step": 1140 }, { "epoch": 0.9915128654182945, "grad_norm": 0.6182934641838074, "learning_rate": 4.23236780491321e-06, "loss": 1.2723, "step": 1150 }, { "epoch": 1.0007813552472047, "grad_norm": 1.8191434144973755, "learning_rate": 4.214183542078646e-06, "loss": 1.3882, "step": 1160 }, { "epoch": 1.009403206250842, "grad_norm": 0.7100806832313538, "learning_rate": 4.195826479819523e-06, "loss": 1.2857, "step": 1170 }, { "epoch": 1.0180250572544793, "grad_norm": 0.5903263688087463, "learning_rate": 4.177298468631844e-06, "loss": 1.2888, "step": 1180 }, { "epoch": 1.0266469082581167, "grad_norm": 0.6088208556175232, "learning_rate": 4.158601376244237e-06, "loss": 1.2355, "step": 1190 }, { "epoch": 1.035268759261754, "grad_norm": 0.6548230648040771, "learning_rate": 4.139737087429672e-06, "loss": 1.2435, "step": 1200 }, { "epoch": 1.0438906102653913, "grad_norm": 0.6475362777709961, "learning_rate": 4.120707503815464e-06, "loss": 1.2462, "step": 1210 }, { "epoch": 1.0525124612690286, "grad_norm": 0.7016700506210327, "learning_rate": 4.101514543691588e-06, "loss": 1.2479, "step": 1220 }, { "epoch": 1.061134312272666, "grad_norm": 0.6940033435821533, "learning_rate": 4.0821601418172926e-06, "loss": 1.2659, "step": 1230 }, { "epoch": 1.0697561632763033, "grad_norm": 0.6648741960525513, "learning_rate": 4.0626462492260725e-06, "loss": 1.2441, "step": 1240 }, { "epoch": 1.0783780142799406, "grad_norm": 0.665122389793396, "learning_rate": 4.042974833028992e-06, "loss": 1.2792, "step": 1250 }, { "epoch": 1.0869998652835782, "grad_norm": 0.6138463020324707, "learning_rate": 4.0231478762163865e-06, "loss": 1.2462, "step": 1260 }, { "epoch": 1.0956217162872155, "grad_norm": 0.61916184425354, "learning_rate": 4.003167377457972e-06, "loss": 1.2858, "step": 1270 }, { "epoch": 1.1042435672908528, "grad_norm": 0.6411153674125671, "learning_rate": 3.983035350901356e-06, "loss": 1.2519, "step": 1280 }, { "epoch": 1.1128654182944901, "grad_norm": 0.6579316854476929, "learning_rate": 3.962753825969016e-06, "loss": 1.2661, "step": 1290 }, { "epoch": 1.1214872692981275, "grad_norm": 0.6916026473045349, "learning_rate": 3.942324847153706e-06, "loss": 1.2812, "step": 1300 }, { "epoch": 1.1301091203017648, "grad_norm": 0.6541363596916199, "learning_rate": 3.921750473812377e-06, "loss": 1.2454, "step": 1310 }, { "epoch": 1.1387309713054021, "grad_norm": 0.6301002502441406, "learning_rate": 3.901032779958563e-06, "loss": 1.2452, "step": 1320 }, { "epoch": 1.1473528223090395, "grad_norm": 0.6470747590065002, "learning_rate": 3.880173854053325e-06, "loss": 1.242, "step": 1330 }, { "epoch": 1.1559746733126768, "grad_norm": 0.62432861328125, "learning_rate": 3.859175798794715e-06, "loss": 1.2578, "step": 1340 }, { "epoch": 1.164596524316314, "grad_norm": 0.735650897026062, "learning_rate": 3.838040730905811e-06, "loss": 1.2323, "step": 1350 }, { "epoch": 1.1732183753199514, "grad_norm": 0.6072832345962524, "learning_rate": 3.816770780921343e-06, "loss": 1.2417, "step": 1360 }, { "epoch": 1.1818402263235888, "grad_norm": 0.6269782185554504, "learning_rate": 3.7953680929729215e-06, "loss": 1.2579, "step": 1370 }, { "epoch": 1.190462077327226, "grad_norm": 0.6426697373390198, "learning_rate": 3.7738348245728953e-06, "loss": 1.2711, "step": 1380 }, { "epoch": 1.1990839283308636, "grad_norm": 0.6683219075202942, "learning_rate": 3.7521731463968638e-06, "loss": 1.2375, "step": 1390 }, { "epoch": 1.207705779334501, "grad_norm": 0.7327633500099182, "learning_rate": 3.730385242064861e-06, "loss": 1.2509, "step": 1400 }, { "epoch": 1.2163276303381383, "grad_norm": 0.6698377728462219, "learning_rate": 3.708473307921234e-06, "loss": 1.2748, "step": 1410 }, { "epoch": 1.2249494813417756, "grad_norm": 0.6427878737449646, "learning_rate": 3.686439552813236e-06, "loss": 1.2753, "step": 1420 }, { "epoch": 1.233571332345413, "grad_norm": 0.7282299399375916, "learning_rate": 3.6642861978683676e-06, "loss": 1.2218, "step": 1430 }, { "epoch": 1.2421931833490503, "grad_norm": 0.6039260029792786, "learning_rate": 3.6420154762704685e-06, "loss": 1.243, "step": 1440 }, { "epoch": 1.2508150343526876, "grad_norm": 0.6218879222869873, "learning_rate": 3.619629633034604e-06, "loss": 1.2225, "step": 1450 }, { "epoch": 1.259436885356325, "grad_norm": 0.660929799079895, "learning_rate": 3.597130924780754e-06, "loss": 1.2641, "step": 1460 }, { "epoch": 1.2680587363599622, "grad_norm": 0.6086330413818359, "learning_rate": 3.574521619506332e-06, "loss": 1.2288, "step": 1470 }, { "epoch": 1.2766805873635996, "grad_norm": 0.6594045162200928, "learning_rate": 3.5518039963575577e-06, "loss": 1.2558, "step": 1480 }, { "epoch": 1.285302438367237, "grad_norm": 0.6506398320198059, "learning_rate": 3.5289803453997087e-06, "loss": 1.2361, "step": 1490 }, { "epoch": 1.2939242893708744, "grad_norm": 0.6286528706550598, "learning_rate": 3.506052967386265e-06, "loss": 1.2344, "step": 1500 }, { "epoch": 1.2939242893708744, "eval_loss": 1.1364344358444214, "eval_runtime": 4371.3293, "eval_samples_per_second": 15.094, "eval_steps_per_second": 7.547, "step": 1500 }, { "epoch": 1.3025461403745116, "grad_norm": 0.6357390880584717, "learning_rate": 3.4830241735269852e-06, "loss": 1.2597, "step": 1510 }, { "epoch": 1.311167991378149, "grad_norm": 0.5889900326728821, "learning_rate": 3.459896285254917e-06, "loss": 1.2535, "step": 1520 }, { "epoch": 1.3197898423817862, "grad_norm": 0.7132574319839478, "learning_rate": 3.436671633992389e-06, "loss": 1.2496, "step": 1530 }, { "epoch": 1.3284116933854238, "grad_norm": 0.604434072971344, "learning_rate": 3.4133525609159883e-06, "loss": 1.2578, "step": 1540 }, { "epoch": 1.337033544389061, "grad_norm": 0.6603388786315918, "learning_rate": 3.3899414167205547e-06, "loss": 1.2462, "step": 1550 }, { "epoch": 1.3456553953926984, "grad_norm": 0.5738435983657837, "learning_rate": 3.3664405613822216e-06, "loss": 1.2309, "step": 1560 }, { "epoch": 1.3542772463963357, "grad_norm": 0.6693400740623474, "learning_rate": 3.3428523639205125e-06, "loss": 1.2656, "step": 1570 }, { "epoch": 1.362899097399973, "grad_norm": 0.6772233843803406, "learning_rate": 3.319179202159532e-06, "loss": 1.2326, "step": 1580 }, { "epoch": 1.3715209484036104, "grad_norm": 0.6765257716178894, "learning_rate": 3.295423462488271e-06, "loss": 1.2666, "step": 1590 }, { "epoch": 1.3801427994072477, "grad_norm": 0.61844402551651, "learning_rate": 3.271587539620039e-06, "loss": 1.2188, "step": 1600 }, { "epoch": 1.388764650410885, "grad_norm": 0.6714752912521362, "learning_rate": 3.247673836351068e-06, "loss": 1.2276, "step": 1610 }, { "epoch": 1.3973865014145224, "grad_norm": 0.5900276899337769, "learning_rate": 3.2236847633182955e-06, "loss": 1.2452, "step": 1620 }, { "epoch": 1.40600835241816, "grad_norm": 0.6843028664588928, "learning_rate": 3.199622738756357e-06, "loss": 1.2317, "step": 1630 }, { "epoch": 1.414630203421797, "grad_norm": 0.7222546935081482, "learning_rate": 3.17549018825382e-06, "loss": 1.2445, "step": 1640 }, { "epoch": 1.4232520544254346, "grad_norm": 0.6822832226753235, "learning_rate": 3.151289544508664e-06, "loss": 1.2442, "step": 1650 }, { "epoch": 1.4318739054290717, "grad_norm": 0.7010654211044312, "learning_rate": 3.1270232470830525e-06, "loss": 1.2517, "step": 1660 }, { "epoch": 1.4404957564327092, "grad_norm": 0.6761536598205566, "learning_rate": 3.102693742157415e-06, "loss": 1.2424, "step": 1670 }, { "epoch": 1.4491176074363465, "grad_norm": 0.730097234249115, "learning_rate": 3.078303482283854e-06, "loss": 1.2167, "step": 1680 }, { "epoch": 1.4577394584399839, "grad_norm": 0.7009713053703308, "learning_rate": 3.0538549261389154e-06, "loss": 1.2492, "step": 1690 }, { "epoch": 1.4663613094436212, "grad_norm": 0.5926857590675354, "learning_rate": 3.029350538275742e-06, "loss": 1.1965, "step": 1700 }, { "epoch": 1.4749831604472585, "grad_norm": 0.6391776204109192, "learning_rate": 3.0047927888756268e-06, "loss": 1.2326, "step": 1710 }, { "epoch": 1.4836050114508958, "grad_norm": 0.7003401517868042, "learning_rate": 2.9801841534990115e-06, "loss": 1.2248, "step": 1720 }, { "epoch": 1.4922268624545332, "grad_norm": 0.682777464389801, "learning_rate": 2.9555271128359326e-06, "loss": 1.2305, "step": 1730 }, { "epoch": 1.5008487134581705, "grad_norm": 0.5897073745727539, "learning_rate": 2.9308241524559522e-06, "loss": 1.2269, "step": 1740 }, { "epoch": 1.5094705644618078, "grad_norm": 0.7111027240753174, "learning_rate": 2.9060777625576014e-06, "loss": 1.2338, "step": 1750 }, { "epoch": 1.5180924154654454, "grad_norm": 0.6545217037200928, "learning_rate": 2.8812904377173532e-06, "loss": 1.2222, "step": 1760 }, { "epoch": 1.5267142664690825, "grad_norm": 0.6440667510032654, "learning_rate": 2.856464676638156e-06, "loss": 1.2033, "step": 1770 }, { "epoch": 1.53533611747272, "grad_norm": 0.7168214321136475, "learning_rate": 2.831602981897546e-06, "loss": 1.2479, "step": 1780 }, { "epoch": 1.5439579684763571, "grad_norm": 0.6428610682487488, "learning_rate": 2.8067078596953793e-06, "loss": 1.2302, "step": 1790 }, { "epoch": 1.5525798194799947, "grad_norm": 0.6651865839958191, "learning_rate": 2.7817818196011897e-06, "loss": 1.263, "step": 1800 }, { "epoch": 1.561201670483632, "grad_norm": 0.6888891458511353, "learning_rate": 2.756827374301207e-06, "loss": 1.2001, "step": 1810 }, { "epoch": 1.5698235214872693, "grad_norm": 0.6644035577774048, "learning_rate": 2.73184703934507e-06, "loss": 1.216, "step": 1820 }, { "epoch": 1.5784453724909067, "grad_norm": 0.6795063614845276, "learning_rate": 2.7068433328922405e-06, "loss": 1.245, "step": 1830 }, { "epoch": 1.587067223494544, "grad_norm": 0.7901127338409424, "learning_rate": 2.68181877545816e-06, "loss": 1.2168, "step": 1840 }, { "epoch": 1.5956890744981813, "grad_norm": 0.6792474389076233, "learning_rate": 2.6567758896601654e-06, "loss": 1.2406, "step": 1850 }, { "epoch": 1.6043109255018186, "grad_norm": 0.638313353061676, "learning_rate": 2.6317171999631992e-06, "loss": 1.253, "step": 1860 }, { "epoch": 1.612932776505456, "grad_norm": 0.7407149076461792, "learning_rate": 2.6066452324253257e-06, "loss": 1.2279, "step": 1870 }, { "epoch": 1.6215546275090933, "grad_norm": 0.6624804139137268, "learning_rate": 2.58156251444309e-06, "loss": 1.2433, "step": 1880 }, { "epoch": 1.6301764785127308, "grad_norm": 0.6785764694213867, "learning_rate": 2.5564715744967446e-06, "loss": 1.2267, "step": 1890 }, { "epoch": 1.638798329516368, "grad_norm": 0.7038357853889465, "learning_rate": 2.531374941895361e-06, "loss": 1.2371, "step": 1900 }, { "epoch": 1.6474201805200055, "grad_norm": 0.7683678269386292, "learning_rate": 2.506275146521863e-06, "loss": 1.2039, "step": 1910 }, { "epoch": 1.6560420315236426, "grad_norm": 0.6339368224143982, "learning_rate": 2.4811747185780005e-06, "loss": 1.201, "step": 1920 }, { "epoch": 1.6646638825272801, "grad_norm": 0.8253235220909119, "learning_rate": 2.45607618832929e-06, "loss": 1.2585, "step": 1930 }, { "epoch": 1.6732857335309175, "grad_norm": 0.7511754631996155, "learning_rate": 2.4309820858499487e-06, "loss": 1.2043, "step": 1940 }, { "epoch": 1.6819075845345548, "grad_norm": 0.709459662437439, "learning_rate": 2.405894940767851e-06, "loss": 1.2493, "step": 1950 }, { "epoch": 1.6905294355381921, "grad_norm": 0.6520094871520996, "learning_rate": 2.380817282009523e-06, "loss": 1.2514, "step": 1960 }, { "epoch": 1.6991512865418295, "grad_norm": 0.6714244484901428, "learning_rate": 2.35575163754522e-06, "loss": 1.2204, "step": 1970 }, { "epoch": 1.7077731375454668, "grad_norm": 0.6813965439796448, "learning_rate": 2.330700534134086e-06, "loss": 1.2042, "step": 1980 }, { "epoch": 1.716394988549104, "grad_norm": 0.6882847547531128, "learning_rate": 2.3056664970694433e-06, "loss": 1.2139, "step": 1990 }, { "epoch": 1.7250168395527414, "grad_norm": 0.7284813523292542, "learning_rate": 2.280652049924232e-06, "loss": 1.2124, "step": 2000 }, { "epoch": 1.7250168395527414, "eval_loss": 1.1232779026031494, "eval_runtime": 4379.8083, "eval_samples_per_second": 15.065, "eval_steps_per_second": 7.533, "step": 2000 } ], "logging_steps": 10, "max_steps": 3477, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.949667944935509e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }