{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.9171597633136095, "eval_steps": 25, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14792899408284024, "grad_norm": 0.8603515625, "learning_rate": 0.0001951951951951952, "loss": 0.939, "step": 25 }, { "epoch": 0.14792899408284024, "eval_loss": 0.6645179390907288, "eval_runtime": 5.4254, "eval_samples_per_second": 16.404, "eval_steps_per_second": 2.212, "step": 25 }, { "epoch": 0.2958579881656805, "grad_norm": 0.67138671875, "learning_rate": 0.0001901901901901902, "loss": 0.6144, "step": 50 }, { "epoch": 0.2958579881656805, "eval_loss": 0.6127904653549194, "eval_runtime": 5.4718, "eval_samples_per_second": 16.265, "eval_steps_per_second": 2.193, "step": 50 }, { "epoch": 0.4437869822485207, "grad_norm": 0.7236328125, "learning_rate": 0.0001851851851851852, "loss": 0.6042, "step": 75 }, { "epoch": 0.4437869822485207, "eval_loss": 0.6052118539810181, "eval_runtime": 5.4747, "eval_samples_per_second": 16.256, "eval_steps_per_second": 2.192, "step": 75 }, { "epoch": 0.591715976331361, "grad_norm": 0.65283203125, "learning_rate": 0.00018018018018018018, "loss": 0.5929, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.5997007489204407, "eval_runtime": 5.5068, "eval_samples_per_second": 16.162, "eval_steps_per_second": 2.179, "step": 100 }, { "epoch": 0.7396449704142012, "grad_norm": 0.828125, "learning_rate": 0.0001751751751751752, "loss": 0.5968, "step": 125 }, { "epoch": 0.7396449704142012, "eval_loss": 0.5949457287788391, "eval_runtime": 5.5046, "eval_samples_per_second": 16.168, "eval_steps_per_second": 2.18, "step": 125 }, { "epoch": 0.8875739644970414, "grad_norm": 0.66357421875, "learning_rate": 0.0001701701701701702, "loss": 0.6017, "step": 150 }, { "epoch": 0.8875739644970414, "eval_loss": 0.5932657122612, "eval_runtime": 5.4672, "eval_samples_per_second": 16.279, "eval_steps_per_second": 2.195, "step": 150 }, { "epoch": 1.0355029585798816, "grad_norm": 0.59228515625, "learning_rate": 0.00016516516516516518, "loss": 0.5471, "step": 175 }, { "epoch": 1.0355029585798816, "eval_loss": 0.6293187141418457, "eval_runtime": 5.5245, "eval_samples_per_second": 16.11, "eval_steps_per_second": 2.172, "step": 175 }, { "epoch": 1.183431952662722, "grad_norm": 0.62158203125, "learning_rate": 0.00016016016016016018, "loss": 0.4246, "step": 200 }, { "epoch": 1.183431952662722, "eval_loss": 0.6185322403907776, "eval_runtime": 5.5491, "eval_samples_per_second": 16.039, "eval_steps_per_second": 2.163, "step": 200 }, { "epoch": 1.331360946745562, "grad_norm": 0.71142578125, "learning_rate": 0.00015515515515515516, "loss": 0.4311, "step": 225 }, { "epoch": 1.331360946745562, "eval_loss": 0.6142793893814087, "eval_runtime": 5.4779, "eval_samples_per_second": 16.247, "eval_steps_per_second": 2.191, "step": 225 }, { "epoch": 1.4792899408284024, "grad_norm": 0.6875, "learning_rate": 0.00015015015015015014, "loss": 0.4175, "step": 250 }, { "epoch": 1.4792899408284024, "eval_loss": 0.6187874674797058, "eval_runtime": 5.4791, "eval_samples_per_second": 16.243, "eval_steps_per_second": 2.19, "step": 250 }, { "epoch": 1.6272189349112427, "grad_norm": 0.76953125, "learning_rate": 0.00014514514514514515, "loss": 0.4303, "step": 275 }, { "epoch": 1.6272189349112427, "eval_loss": 0.6225253343582153, "eval_runtime": 5.5347, "eval_samples_per_second": 16.08, "eval_steps_per_second": 2.168, "step": 275 }, { "epoch": 1.7751479289940828, "grad_norm": 0.6640625, "learning_rate": 0.00014014014014014013, "loss": 0.4271, "step": 300 }, { "epoch": 1.7751479289940828, "eval_loss": 0.6251070499420166, "eval_runtime": 5.4796, "eval_samples_per_second": 16.242, "eval_steps_per_second": 2.19, "step": 300 }, { "epoch": 1.9230769230769231, "grad_norm": 0.72998046875, "learning_rate": 0.00013513513513513514, "loss": 0.4248, "step": 325 }, { "epoch": 1.9230769230769231, "eval_loss": 0.6276720762252808, "eval_runtime": 5.5272, "eval_samples_per_second": 16.102, "eval_steps_per_second": 2.171, "step": 325 }, { "epoch": 2.0710059171597632, "grad_norm": 0.72705078125, "learning_rate": 0.00013013013013013014, "loss": 0.3568, "step": 350 }, { "epoch": 2.0710059171597632, "eval_loss": 0.6846657395362854, "eval_runtime": 5.4826, "eval_samples_per_second": 16.233, "eval_steps_per_second": 2.189, "step": 350 }, { "epoch": 2.2189349112426036, "grad_norm": 0.732421875, "learning_rate": 0.00012512512512512512, "loss": 0.2759, "step": 375 }, { "epoch": 2.2189349112426036, "eval_loss": 0.7119464874267578, "eval_runtime": 5.5426, "eval_samples_per_second": 16.057, "eval_steps_per_second": 2.165, "step": 375 }, { "epoch": 2.366863905325444, "grad_norm": 0.7099609375, "learning_rate": 0.00012012012012012013, "loss": 0.2687, "step": 400 }, { "epoch": 2.366863905325444, "eval_loss": 0.7088969945907593, "eval_runtime": 5.5287, "eval_samples_per_second": 16.098, "eval_steps_per_second": 2.17, "step": 400 }, { "epoch": 2.5147928994082838, "grad_norm": 0.86767578125, "learning_rate": 0.00011511511511511512, "loss": 0.2796, "step": 425 }, { "epoch": 2.5147928994082838, "eval_loss": 0.716306209564209, "eval_runtime": 5.5463, "eval_samples_per_second": 16.047, "eval_steps_per_second": 2.164, "step": 425 }, { "epoch": 2.662721893491124, "grad_norm": 0.77587890625, "learning_rate": 0.00011011011011011012, "loss": 0.2735, "step": 450 }, { "epoch": 2.662721893491124, "eval_loss": 0.7141934037208557, "eval_runtime": 5.4992, "eval_samples_per_second": 16.184, "eval_steps_per_second": 2.182, "step": 450 }, { "epoch": 2.8106508875739644, "grad_norm": 83.5625, "learning_rate": 0.00010510510510510511, "loss": 0.284, "step": 475 }, { "epoch": 2.8106508875739644, "eval_loss": 0.7146495580673218, "eval_runtime": 5.5227, "eval_samples_per_second": 16.115, "eval_steps_per_second": 2.173, "step": 475 }, { "epoch": 2.9585798816568047, "grad_norm": 0.732421875, "learning_rate": 0.00010010010010010012, "loss": 0.2803, "step": 500 }, { "epoch": 2.9585798816568047, "eval_loss": 0.7089855074882507, "eval_runtime": 5.499, "eval_samples_per_second": 16.185, "eval_steps_per_second": 2.182, "step": 500 }, { "epoch": 3.106508875739645, "grad_norm": 0.59716796875, "learning_rate": 9.50950950950951e-05, "loss": 0.1915, "step": 525 }, { "epoch": 3.106508875739645, "eval_loss": 0.8113237619400024, "eval_runtime": 5.5133, "eval_samples_per_second": 16.143, "eval_steps_per_second": 2.177, "step": 525 }, { "epoch": 3.2544378698224854, "grad_norm": 0.76708984375, "learning_rate": 9.009009009009009e-05, "loss": 0.16, "step": 550 }, { "epoch": 3.2544378698224854, "eval_loss": 0.8327358961105347, "eval_runtime": 5.4786, "eval_samples_per_second": 16.245, "eval_steps_per_second": 2.19, "step": 550 }, { "epoch": 3.4023668639053253, "grad_norm": 0.7197265625, "learning_rate": 8.50850850850851e-05, "loss": 0.1621, "step": 575 }, { "epoch": 3.4023668639053253, "eval_loss": 0.8468723297119141, "eval_runtime": 5.5083, "eval_samples_per_second": 16.157, "eval_steps_per_second": 2.179, "step": 575 }, { "epoch": 3.5502958579881656, "grad_norm": 0.75146484375, "learning_rate": 8.008008008008009e-05, "loss": 0.163, "step": 600 }, { "epoch": 3.5502958579881656, "eval_loss": 0.8476194143295288, "eval_runtime": 5.5014, "eval_samples_per_second": 16.178, "eval_steps_per_second": 2.181, "step": 600 }, { "epoch": 3.698224852071006, "grad_norm": 0.921875, "learning_rate": 7.507507507507507e-05, "loss": 0.1615, "step": 625 }, { "epoch": 3.698224852071006, "eval_loss": 0.8421955108642578, "eval_runtime": 5.4752, "eval_samples_per_second": 16.255, "eval_steps_per_second": 2.192, "step": 625 }, { "epoch": 3.8461538461538463, "grad_norm": 0.85107421875, "learning_rate": 7.007007007007007e-05, "loss": 0.1737, "step": 650 }, { "epoch": 3.8461538461538463, "eval_loss": 0.8518087267875671, "eval_runtime": 5.5254, "eval_samples_per_second": 16.107, "eval_steps_per_second": 2.172, "step": 650 }, { "epoch": 3.994082840236686, "grad_norm": 0.87841796875, "learning_rate": 6.506506506506507e-05, "loss": 0.1685, "step": 675 }, { "epoch": 3.994082840236686, "eval_loss": 0.857303261756897, "eval_runtime": 5.491, "eval_samples_per_second": 16.208, "eval_steps_per_second": 2.185, "step": 675 }, { "epoch": 4.1420118343195265, "grad_norm": 0.63134765625, "learning_rate": 6.0060060060060066e-05, "loss": 0.0961, "step": 700 }, { "epoch": 4.1420118343195265, "eval_loss": 0.9935606122016907, "eval_runtime": 5.4899, "eval_samples_per_second": 16.212, "eval_steps_per_second": 2.186, "step": 700 }, { "epoch": 4.289940828402367, "grad_norm": 0.66650390625, "learning_rate": 5.505505505505506e-05, "loss": 0.0874, "step": 725 }, { "epoch": 4.289940828402367, "eval_loss": 1.0188310146331787, "eval_runtime": 5.4814, "eval_samples_per_second": 16.237, "eval_steps_per_second": 2.189, "step": 725 }, { "epoch": 4.437869822485207, "grad_norm": 0.69921875, "learning_rate": 5.005005005005006e-05, "loss": 0.0891, "step": 750 }, { "epoch": 4.437869822485207, "eval_loss": 1.0284762382507324, "eval_runtime": 5.5187, "eval_samples_per_second": 16.127, "eval_steps_per_second": 2.174, "step": 750 }, { "epoch": 4.585798816568047, "grad_norm": 0.6240234375, "learning_rate": 4.5045045045045046e-05, "loss": 0.0897, "step": 775 }, { "epoch": 4.585798816568047, "eval_loss": 1.0269498825073242, "eval_runtime": 5.4812, "eval_samples_per_second": 16.237, "eval_steps_per_second": 2.189, "step": 775 }, { "epoch": 4.733727810650888, "grad_norm": 0.5556640625, "learning_rate": 4.0040040040040046e-05, "loss": 0.0882, "step": 800 }, { "epoch": 4.733727810650888, "eval_loss": 1.0333030223846436, "eval_runtime": 5.5654, "eval_samples_per_second": 15.992, "eval_steps_per_second": 2.156, "step": 800 }, { "epoch": 4.881656804733728, "grad_norm": 0.91455078125, "learning_rate": 3.503503503503503e-05, "loss": 0.0889, "step": 825 }, { "epoch": 4.881656804733728, "eval_loss": 1.0527359247207642, "eval_runtime": 5.4892, "eval_samples_per_second": 16.214, "eval_steps_per_second": 2.186, "step": 825 }, { "epoch": 5.029585798816568, "grad_norm": 0.477294921875, "learning_rate": 3.0030030030030033e-05, "loss": 0.0826, "step": 850 }, { "epoch": 5.029585798816568, "eval_loss": 1.0765188932418823, "eval_runtime": 5.5207, "eval_samples_per_second": 16.121, "eval_steps_per_second": 2.174, "step": 850 }, { "epoch": 5.177514792899408, "grad_norm": 0.56689453125, "learning_rate": 2.502502502502503e-05, "loss": 0.0519, "step": 875 }, { "epoch": 5.177514792899408, "eval_loss": 1.1578717231750488, "eval_runtime": 5.4744, "eval_samples_per_second": 16.258, "eval_steps_per_second": 2.192, "step": 875 }, { "epoch": 5.325443786982248, "grad_norm": 0.447265625, "learning_rate": 2.0020020020020023e-05, "loss": 0.0513, "step": 900 }, { "epoch": 5.325443786982248, "eval_loss": 1.1684048175811768, "eval_runtime": 5.482, "eval_samples_per_second": 16.235, "eval_steps_per_second": 2.189, "step": 900 }, { "epoch": 5.4733727810650885, "grad_norm": 0.58251953125, "learning_rate": 1.5015015015015016e-05, "loss": 0.0523, "step": 925 }, { "epoch": 5.4733727810650885, "eval_loss": 1.1905882358551025, "eval_runtime": 5.4845, "eval_samples_per_second": 16.228, "eval_steps_per_second": 2.188, "step": 925 }, { "epoch": 5.621301775147929, "grad_norm": 0.53466796875, "learning_rate": 1.0010010010010011e-05, "loss": 0.0496, "step": 950 }, { "epoch": 5.621301775147929, "eval_loss": 1.1795778274536133, "eval_runtime": 5.5354, "eval_samples_per_second": 16.078, "eval_steps_per_second": 2.168, "step": 950 }, { "epoch": 5.769230769230769, "grad_norm": 0.432861328125, "learning_rate": 5.005005005005006e-06, "loss": 0.0495, "step": 975 }, { "epoch": 5.769230769230769, "eval_loss": 1.1850156784057617, "eval_runtime": 5.5478, "eval_samples_per_second": 16.042, "eval_steps_per_second": 2.163, "step": 975 }, { "epoch": 5.9171597633136095, "grad_norm": 0.495849609375, "learning_rate": 0.0, "loss": 0.0479, "step": 1000 }, { "epoch": 5.9171597633136095, "eval_loss": 1.18731689453125, "eval_runtime": 5.4886, "eval_samples_per_second": 16.215, "eval_steps_per_second": 2.186, "step": 1000 }, { "epoch": 5.9171597633136095, "step": 1000, "total_flos": 1.75885655212032e+17, "train_loss": 0.27937755072116854, "train_runtime": 1227.8544, "train_samples_per_second": 3.258, "train_steps_per_second": 0.814 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "total_flos": 1.75885655212032e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }