{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500.0, "global_step": 6237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02405002405002405, "grad_norm": 0.5065411329269409, "learning_rate": 0.00019996828714700116, "loss": 1.7914, "step": 50 }, { "epoch": 0.0481000481000481, "grad_norm": 0.34842124581336975, "learning_rate": 0.00019987316870210547, "loss": 1.2136, "step": 100 }, { "epoch": 0.07215007215007214, "grad_norm": 0.3516596257686615, "learning_rate": 0.0001997147049948582, "loss": 1.1937, "step": 150 }, { "epoch": 0.0962000962000962, "grad_norm": 0.4468790888786316, "learning_rate": 0.0001994929965319844, "loss": 1.181, "step": 200 }, { "epoch": 0.12025012025012025, "grad_norm": 0.32536762952804565, "learning_rate": 0.0001992081839336419, "loss": 1.1738, "step": 250 }, { "epoch": 0.1443001443001443, "grad_norm": 0.2973495125770569, "learning_rate": 0.00019886044784423197, "loss": 1.1665, "step": 300 }, { "epoch": 0.16835016835016836, "grad_norm": 0.35850659012794495, "learning_rate": 0.00019845000881782432, "loss": 1.1676, "step": 350 }, { "epoch": 0.1924001924001924, "grad_norm": 6.042001724243164, "learning_rate": 0.00019797712717826914, "loss": 1.1572, "step": 400 }, { "epoch": 0.21645021645021645, "grad_norm": 0.2577451765537262, "learning_rate": 0.00019744210285408488, "loss": 1.1686, "step": 450 }, { "epoch": 0.2405002405002405, "grad_norm": 0.32454270124435425, "learning_rate": 0.0001968452751882264, "loss": 1.1575, "step": 500 }, { "epoch": 0.26455026455026454, "grad_norm": 0.30101215839385986, "learning_rate": 0.00019618702272285434, "loss": 1.159, "step": 550 }, { "epoch": 0.2886002886002886, "grad_norm": 0.6339617967605591, "learning_rate": 0.00019546776295924212, "loss": 1.1571, "step": 600 }, { "epoch": 0.3126503126503126, "grad_norm": 0.2634935975074768, "learning_rate": 0.0001946879520929728, "loss": 1.1475, "step": 650 }, { "epoch": 0.3367003367003367, "grad_norm": 0.292190819978714, "learning_rate": 0.00019384808472459368, "loss": 1.1545, "step": 700 }, { "epoch": 0.36075036075036077, "grad_norm": 0.20567256212234497, "learning_rate": 0.0001929486935459127, "loss": 1.1391, "step": 750 }, { "epoch": 0.3848003848003848, "grad_norm": 0.2804057002067566, "learning_rate": 0.00019199034900213452, "loss": 1.1427, "step": 800 }, { "epoch": 0.40885040885040885, "grad_norm": 0.3674640357494354, "learning_rate": 0.000190973658930052, "loss": 1.1448, "step": 850 }, { "epoch": 0.4329004329004329, "grad_norm": 0.9717519283294678, "learning_rate": 0.00018989926817252113, "loss": 1.1418, "step": 900 }, { "epoch": 0.45695045695045694, "grad_norm": 0.30908727645874023, "learning_rate": 0.00018876785816946505, "loss": 1.1368, "step": 950 }, { "epoch": 0.481000481000481, "grad_norm": 0.23766139149665833, "learning_rate": 0.00018758014652566597, "loss": 1.1464, "step": 1000 }, { "epoch": 0.5050505050505051, "grad_norm": 0.23821455240249634, "learning_rate": 0.0001863368865556191, "loss": 1.1366, "step": 1050 }, { "epoch": 0.5291005291005291, "grad_norm": 0.26834988594055176, "learning_rate": 0.0001850388668057379, "loss": 1.1345, "step": 1100 }, { "epoch": 0.5531505531505532, "grad_norm": 0.24402070045471191, "learning_rate": 0.0001836869105542127, "loss": 1.1354, "step": 1150 }, { "epoch": 0.5772005772005772, "grad_norm": 0.257290244102478, "learning_rate": 0.0001822818752888408, "loss": 1.1351, "step": 1200 }, { "epoch": 0.6012506012506013, "grad_norm": 
0.22287152707576752, "learning_rate": 0.00018082465216315882, "loss": 1.1273, "step": 1250 }, { "epoch": 0.6253006253006252, "grad_norm": 0.2477453052997589, "learning_rate": 0.00017931616543122214, "loss": 1.1369, "step": 1300 }, { "epoch": 0.6493506493506493, "grad_norm": 0.34627246856689453, "learning_rate": 0.00017775737186139038, "loss": 1.122, "step": 1350 }, { "epoch": 0.6734006734006734, "grad_norm": 0.2759104073047638, "learning_rate": 0.00017614926012949028, "loss": 1.1216, "step": 1400 }, { "epoch": 0.6974506974506974, "grad_norm": 0.2591134011745453, "learning_rate": 0.00017449285019174098, "loss": 1.13, "step": 1450 }, { "epoch": 0.7215007215007215, "grad_norm": 0.20448699593544006, "learning_rate": 0.00017278919263783978, "loss": 1.1289, "step": 1500 }, { "epoch": 0.7455507455507455, "grad_norm": 0.2126525342464447, "learning_rate": 0.00017103936802461797, "loss": 1.1256, "step": 1550 }, { "epoch": 0.7696007696007696, "grad_norm": 0.18976248800754547, "learning_rate": 0.00016924448619069023, "loss": 1.1172, "step": 1600 }, { "epoch": 0.7936507936507936, "grad_norm": 0.22607876360416412, "learning_rate": 0.00016740568555253155, "loss": 1.1245, "step": 1650 }, { "epoch": 0.8177008177008177, "grad_norm": 0.24248099327087402, "learning_rate": 0.00016552413238242857, "loss": 1.1228, "step": 1700 }, { "epoch": 0.8417508417508418, "grad_norm": 0.20902663469314575, "learning_rate": 0.00016360102006876317, "loss": 1.1176, "step": 1750 }, { "epoch": 0.8658008658008658, "grad_norm": 0.2559059262275696, "learning_rate": 0.0001616375683590974, "loss": 1.1187, "step": 1800 }, { "epoch": 0.8898508898508899, "grad_norm": 0.2621828317642212, "learning_rate": 0.00015963502258654005, "loss": 1.1103, "step": 1850 }, { "epoch": 0.9139009139009139, "grad_norm": 0.17843465507030487, "learning_rate": 0.0001575946528798853, "loss": 1.1129, "step": 1900 }, { "epoch": 0.937950937950938, "grad_norm": 0.22196488082408905, "learning_rate": 0.0001555177533580245, "loss": 1.1121, "step": 1950 }, { "epoch": 0.962000962000962, "grad_norm": 0.20080924034118652, "learning_rate": 0.00015340564130914233, "loss": 1.1104, "step": 2000 }, { "epoch": 0.9860509860509861, "grad_norm": 0.17231349647045135, "learning_rate": 0.00015125965635521724, "loss": 1.1171, "step": 2050 }, { "epoch": 1.0101010101010102, "grad_norm": 0.17805251479148865, "learning_rate": 0.00014908115960235682, "loss": 1.0872, "step": 2100 }, { "epoch": 1.034151034151034, "grad_norm": 0.18555937707424164, "learning_rate": 0.00014687153277750676, "loss": 1.0656, "step": 2150 }, { "epoch": 1.0582010582010581, "grad_norm": 0.18195118010044098, "learning_rate": 0.00014463217735208062, "loss": 1.0599, "step": 2200 }, { "epoch": 1.0822510822510822, "grad_norm": 0.23314547538757324, "learning_rate": 0.00014236451365306674, "loss": 1.0696, "step": 2250 }, { "epoch": 1.1063011063011063, "grad_norm": 0.22400447726249695, "learning_rate": 0.00014006997996217593, "loss": 1.0733, "step": 2300 }, { "epoch": 1.1303511303511304, "grad_norm": 0.1384362429380417, "learning_rate": 0.00013775003160360096, "loss": 1.0567, "step": 2350 }, { "epoch": 1.1544011544011543, "grad_norm": 0.15328723192214966, "learning_rate": 0.00013540614002096701, "loss": 1.071, "step": 2400 }, { "epoch": 1.1784511784511784, "grad_norm": 0.15236607193946838, "learning_rate": 0.00013303979184405826, "loss": 1.0684, "step": 2450 }, { "epoch": 1.2025012025012025, "grad_norm": 0.5659245252609253, "learning_rate": 0.00013065248794591223, "loss": 1.0666, "step": 2500 }, { "epoch": 
1.2265512265512266, "grad_norm": 0.18339212238788605, "learning_rate": 0.00012824574249088063, "loss": 1.0691, "step": 2550 }, { "epoch": 1.2506012506012505, "grad_norm": 0.15872247517108917, "learning_rate": 0.0001258210819742599, "loss": 1.0642, "step": 2600 }, { "epoch": 1.2746512746512746, "grad_norm": 0.17531836032867432, "learning_rate": 0.00012338004425410074, "loss": 1.061, "step": 2650 }, { "epoch": 1.2987012987012987, "grad_norm": 0.17102229595184326, "learning_rate": 0.00012092417757581085, "loss": 1.0651, "step": 2700 }, { "epoch": 1.3227513227513228, "grad_norm": 0.19855375587940216, "learning_rate": 0.00011845503959016928, "loss": 1.0641, "step": 2750 }, { "epoch": 1.3468013468013469, "grad_norm": 0.1624690294265747, "learning_rate": 0.0001159741963653755, "loss": 1.0575, "step": 2800 }, { "epoch": 1.370851370851371, "grad_norm": 0.15423128008842468, "learning_rate": 0.00011348322139375948, "loss": 1.0695, "step": 2850 }, { "epoch": 1.3949013949013949, "grad_norm": 0.12848949432373047, "learning_rate": 0.00011098369459378328, "loss": 1.0655, "step": 2900 }, { "epoch": 1.418951418951419, "grad_norm": 0.1720573604106903, "learning_rate": 0.00010847720130796631, "loss": 1.0641, "step": 2950 }, { "epoch": 1.443001443001443, "grad_norm": 0.17134524881839752, "learning_rate": 0.00010596533129737092, "loss": 1.0651, "step": 3000 }, { "epoch": 1.467051467051467, "grad_norm": 0.13950586318969727, "learning_rate": 0.00010344967773328507, "loss": 1.0579, "step": 3050 }, { "epoch": 1.491101491101491, "grad_norm": 0.19055236876010895, "learning_rate": 0.00010093183618674224, "loss": 1.0528, "step": 3100 }, { "epoch": 1.5151515151515151, "grad_norm": 0.16765938699245453, "learning_rate": 9.84134036165192e-05, "loss": 1.0547, "step": 3150 }, { "epoch": 1.5392015392015392, "grad_norm": 0.15793581306934357, "learning_rate": 9.589597735625377e-05, "loss": 1.0615, "step": 3200 }, { "epoch": 1.5632515632515633, "grad_norm": 0.16215017437934875, "learning_rate": 9.338115410132441e-05, "loss": 1.0586, "step": 3250 }, { "epoch": 1.5873015873015874, "grad_norm": 0.21864274144172668, "learning_rate": 9.087052889613518e-05, "loss": 1.0609, "step": 3300 }, { "epoch": 1.6113516113516113, "grad_norm": 0.1665191650390625, "learning_rate": 8.836569412244745e-05, "loss": 1.0595, "step": 3350 }, { "epoch": 1.6354016354016354, "grad_norm": 0.15962587296962738, "learning_rate": 8.586823848940047e-05, "loss": 1.0515, "step": 3400 }, { "epoch": 1.6594516594516593, "grad_norm": 0.18745359778404236, "learning_rate": 8.337974602586152e-05, "loss": 1.0592, "step": 3450 }, { "epoch": 1.6835016835016834, "grad_norm": 0.19344636797904968, "learning_rate": 8.090179507574427e-05, "loss": 1.0478, "step": 3500 }, { "epoch": 1.7075517075517075, "grad_norm": 0.13223913311958313, "learning_rate": 7.843595729693316e-05, "loss": 1.045, "step": 3550 }, { "epoch": 1.7316017316017316, "grad_norm": 0.17938382923603058, "learning_rate": 7.598379666444808e-05, "loss": 1.0537, "step": 3600 }, { "epoch": 1.7556517556517557, "grad_norm": 0.15650674700737, "learning_rate": 7.354686847848242e-05, "loss": 1.0498, "step": 3650 }, { "epoch": 1.7797017797017798, "grad_norm": 0.2204657793045044, "learning_rate": 7.11267183779428e-05, "loss": 1.0495, "step": 3700 }, { "epoch": 1.8037518037518039, "grad_norm": 0.1426030993461609, "learning_rate": 6.872488136011667e-05, "loss": 1.0538, "step": 3750 }, { "epoch": 1.8278018278018278, "grad_norm": 0.1690046787261963, "learning_rate": 6.634288080708952e-05, "loss": 1.0509, "step": 3800 }, { "epoch": 
1.8518518518518519, "grad_norm": 0.1793077439069748, "learning_rate": 6.398222751952899e-05, "loss": 1.0457, "step": 3850 }, { "epoch": 1.8759018759018757, "grad_norm": 0.13664492964744568, "learning_rate": 6.164441875844882e-05, "loss": 1.0516, "step": 3900 }, { "epoch": 1.8999518999518998, "grad_norm": 0.1282956898212433, "learning_rate": 5.933093729556062e-05, "loss": 1.0486, "step": 3950 }, { "epoch": 1.924001924001924, "grad_norm": 0.13937096297740936, "learning_rate": 5.7043250472815356e-05, "loss": 1.0411, "step": 4000 }, { "epoch": 1.948051948051948, "grad_norm": 0.1659151166677475, "learning_rate": 5.478280927173145e-05, "loss": 1.0378, "step": 4050 }, { "epoch": 1.9721019721019721, "grad_norm": 0.1273048371076584, "learning_rate": 5.255104739309924e-05, "loss": 1.0444, "step": 4100 }, { "epoch": 1.9961519961519962, "grad_norm": 0.17163003981113434, "learning_rate": 5.0349380347646494e-05, "loss": 1.0399, "step": 4150 }, { "epoch": 2.0202020202020203, "grad_norm": 0.12823982536792755, "learning_rate": 4.8179204558240444e-05, "loss": 1.0092, "step": 4200 }, { "epoch": 2.0442520442520444, "grad_norm": 0.1164403036236763, "learning_rate": 4.6041896474197e-05, "loss": 0.9951, "step": 4250 }, { "epoch": 2.068302068302068, "grad_norm": 0.1204572319984436, "learning_rate": 4.393881169825779e-05, "loss": 0.9998, "step": 4300 }, { "epoch": 2.092352092352092, "grad_norm": 0.11660825461149216, "learning_rate": 4.187128412678969e-05, "loss": 0.9983, "step": 4350 }, { "epoch": 2.1164021164021163, "grad_norm": 0.11894522607326508, "learning_rate": 3.984062510375155e-05, "loss": 0.9967, "step": 4400 }, { "epoch": 2.1404521404521404, "grad_norm": 0.13249389827251434, "learning_rate": 3.7848122588965144e-05, "loss": 0.9974, "step": 4450 }, { "epoch": 2.1645021645021645, "grad_norm": 0.1497354954481125, "learning_rate": 3.5895040341217543e-05, "loss": 0.999, "step": 4500 }, { "epoch": 2.1885521885521886, "grad_norm": 0.11724729835987091, "learning_rate": 3.398261711671309e-05, "loss": 1.0033, "step": 4550 }, { "epoch": 2.2126022126022127, "grad_norm": 0.11575422435998917, "learning_rate": 3.211206588338358e-05, "loss": 0.9988, "step": 4600 }, { "epoch": 2.236652236652237, "grad_norm": 0.11643755435943604, "learning_rate": 3.028457305155483e-05, "loss": 0.9971, "step": 4650 }, { "epoch": 2.260702260702261, "grad_norm": 0.10507030785083771, "learning_rate": 2.8501297721457422e-05, "loss": 0.9906, "step": 4700 }, { "epoch": 2.284752284752285, "grad_norm": 0.10402993112802505, "learning_rate": 2.6763370948059353e-05, "loss": 1.0007, "step": 4750 }, { "epoch": 2.3088023088023086, "grad_norm": 0.10819538682699203, "learning_rate": 2.5071895023686442e-05, "loss": 0.9969, "step": 4800 }, { "epoch": 2.3328523328523327, "grad_norm": 0.1739010065793991, "learning_rate": 2.342794277888547e-05, "loss": 0.9976, "step": 4850 }, { "epoch": 2.356902356902357, "grad_norm": 0.10811810195446014, "learning_rate": 2.1832556901973965e-05, "loss": 0.9946, "step": 4900 }, { "epoch": 2.380952380952381, "grad_norm": 0.1108924001455307, "learning_rate": 2.0286749277707782e-05, "loss": 0.9906, "step": 4950 }, { "epoch": 2.405002405002405, "grad_norm": 0.10033036023378372, "learning_rate": 1.879150034548588e-05, "loss": 0.9997, "step": 5000 }, { "epoch": 2.429052429052429, "grad_norm": 0.12525063753128052, "learning_rate": 1.7347758477500044e-05, "loss": 0.9885, "step": 5050 }, { "epoch": 2.4531024531024532, "grad_norm": 0.10124919563531876, "learning_rate": 1.5956439377222798e-05, "loss": 0.9964, "step": 5100 }, { "epoch": 
2.4771524771524773, "grad_norm": 0.10390116274356842, "learning_rate": 1.4618425498616162e-05, "loss": 0.9893, "step": 5150 }, { "epoch": 2.501202501202501, "grad_norm": 0.1200655922293663, "learning_rate": 1.3334565486428996e-05, "loss": 0.989, "step": 5200 }, { "epoch": 2.525252525252525, "grad_norm": 0.11598347127437592, "learning_rate": 1.2105673637938053e-05, "loss": 0.994, "step": 5250 }, { "epoch": 2.549302549302549, "grad_norm": 0.10302892327308655, "learning_rate": 1.0932529386474188e-05, "loss": 0.9911, "step": 5300 }, { "epoch": 2.5733525733525733, "grad_norm": 0.10021837800741196, "learning_rate": 9.815876807061264e-06, "loss": 1.0002, "step": 5350 }, { "epoch": 2.5974025974025974, "grad_norm": 0.09885919839143753, "learning_rate": 8.756424144481312e-06, "loss": 0.9882, "step": 5400 }, { "epoch": 2.6214526214526215, "grad_norm": 0.10271639376878738, "learning_rate": 7.75484336406529e-06, "loss": 0.9991, "step": 5450 }, { "epoch": 2.6455026455026456, "grad_norm": 0.10511433333158493, "learning_rate": 6.8117697254943106e-06, "loss": 0.9908, "step": 5500 }, { "epoch": 2.6695526695526697, "grad_norm": 0.1004793718457222, "learning_rate": 5.927801379881714e-06, "loss": 0.9977, "step": 5550 }, { "epoch": 2.6936026936026938, "grad_norm": 0.0923864096403122, "learning_rate": 5.103498990391509e-06, "loss": 0.989, "step": 5600 }, { "epoch": 2.717652717652718, "grad_norm": 0.09591204673051834, "learning_rate": 4.339385376633775e-06, "loss": 0.9917, "step": 5650 }, { "epoch": 2.741702741702742, "grad_norm": 0.09710726141929626, "learning_rate": 3.6359451830626723e-06, "loss": 0.9871, "step": 5700 }, { "epoch": 2.7657527657527656, "grad_norm": 0.0920204296708107, "learning_rate": 2.993624571587239e-06, "loss": 0.9878, "step": 5750 }, { "epoch": 2.7898027898027897, "grad_norm": 0.09445653855800629, "learning_rate": 2.4128309385900717e-06, "loss": 0.9886, "step": 5800 }, { "epoch": 2.813852813852814, "grad_norm": 0.0918457955121994, "learning_rate": 1.8939326565333037e-06, "loss": 0.9872, "step": 5850 }, { "epoch": 2.837902837902838, "grad_norm": 0.08985795080661774, "learning_rate": 1.437258840315714e-06, "loss": 0.9942, "step": 5900 }, { "epoch": 2.861952861952862, "grad_norm": 0.08798079937696457, "learning_rate": 1.0430991385293575e-06, "loss": 0.9911, "step": 5950 }, { "epoch": 2.886002886002886, "grad_norm": 0.08569388091564178, "learning_rate": 7.117035497478553e-07, "loss": 0.9932, "step": 6000 }, { "epoch": 2.91005291005291, "grad_norm": 0.08316464722156525, "learning_rate": 4.432822639630407e-07, "loss": 0.9895, "step": 6050 }, { "epoch": 2.934102934102934, "grad_norm": 0.08785533905029297, "learning_rate": 2.380055292704575e-07, "loss": 0.9937, "step": 6100 }, { "epoch": 2.958152958152958, "grad_norm": 0.08768357336521149, "learning_rate": 9.600354388833443e-08, "loss": 0.9922, "step": 6150 }, { "epoch": 2.982202982202982, "grad_norm": 0.08772250264883041, "learning_rate": 1.7366373578442397e-08, "loss": 0.9919, "step": 6200 } ], "logging_steps": 50, "max_steps": 6237, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.355263840216364e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }