|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.9171597633136095, |
|
"eval_steps": 25, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14792899408284024, |
|
"grad_norm": 0.8603515625, |
|
"learning_rate": 0.0001951951951951952, |
|
"loss": 0.939, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14792899408284024, |
|
"eval_loss": 0.6645179390907288, |
|
"eval_runtime": 5.4254, |
|
"eval_samples_per_second": 16.404, |
|
"eval_steps_per_second": 2.212, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.67138671875, |
|
"learning_rate": 0.0001901901901901902, |
|
"loss": 0.6144, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"eval_loss": 0.6127904653549194, |
|
"eval_runtime": 5.4718, |
|
"eval_samples_per_second": 16.265, |
|
"eval_steps_per_second": 2.193, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4437869822485207, |
|
"grad_norm": 0.7236328125, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 0.6042, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4437869822485207, |
|
"eval_loss": 0.6052118539810181, |
|
"eval_runtime": 5.4747, |
|
"eval_samples_per_second": 16.256, |
|
"eval_steps_per_second": 2.192, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 0.65283203125, |
|
"learning_rate": 0.00018018018018018018, |
|
"loss": 0.5929, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.5997007489204407, |
|
"eval_runtime": 5.5068, |
|
"eval_samples_per_second": 16.162, |
|
"eval_steps_per_second": 2.179, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7396449704142012, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0001751751751751752, |
|
"loss": 0.5968, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7396449704142012, |
|
"eval_loss": 0.5949457287788391, |
|
"eval_runtime": 5.5046, |
|
"eval_samples_per_second": 16.168, |
|
"eval_steps_per_second": 2.18, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 0.66357421875, |
|
"learning_rate": 0.0001701701701701702, |
|
"loss": 0.6017, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"eval_loss": 0.5932657122612, |
|
"eval_runtime": 5.4672, |
|
"eval_samples_per_second": 16.279, |
|
"eval_steps_per_second": 2.195, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0355029585798816, |
|
"grad_norm": 0.59228515625, |
|
"learning_rate": 0.00016516516516516518, |
|
"loss": 0.5471, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0355029585798816, |
|
"eval_loss": 0.6293187141418457, |
|
"eval_runtime": 5.5245, |
|
"eval_samples_per_second": 16.11, |
|
"eval_steps_per_second": 2.172, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 0.62158203125, |
|
"learning_rate": 0.00016016016016016018, |
|
"loss": 0.4246, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"eval_loss": 0.6185322403907776, |
|
"eval_runtime": 5.5491, |
|
"eval_samples_per_second": 16.039, |
|
"eval_steps_per_second": 2.163, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.331360946745562, |
|
"grad_norm": 0.71142578125, |
|
"learning_rate": 0.00015515515515515516, |
|
"loss": 0.4311, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.331360946745562, |
|
"eval_loss": 0.6142793893814087, |
|
"eval_runtime": 5.4779, |
|
"eval_samples_per_second": 16.247, |
|
"eval_steps_per_second": 2.191, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4792899408284024, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00015015015015015014, |
|
"loss": 0.4175, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4792899408284024, |
|
"eval_loss": 0.6187874674797058, |
|
"eval_runtime": 5.4791, |
|
"eval_samples_per_second": 16.243, |
|
"eval_steps_per_second": 2.19, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6272189349112427, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00014514514514514515, |
|
"loss": 0.4303, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6272189349112427, |
|
"eval_loss": 0.6225253343582153, |
|
"eval_runtime": 5.5347, |
|
"eval_samples_per_second": 16.08, |
|
"eval_steps_per_second": 2.168, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00014014014014014013, |
|
"loss": 0.4271, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"eval_loss": 0.6251070499420166, |
|
"eval_runtime": 5.4796, |
|
"eval_samples_per_second": 16.242, |
|
"eval_steps_per_second": 2.19, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 0.72998046875, |
|
"learning_rate": 0.00013513513513513514, |
|
"loss": 0.4248, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"eval_loss": 0.6276720762252808, |
|
"eval_runtime": 5.5272, |
|
"eval_samples_per_second": 16.102, |
|
"eval_steps_per_second": 2.171, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.0710059171597632, |
|
"grad_norm": 0.72705078125, |
|
"learning_rate": 0.00013013013013013014, |
|
"loss": 0.3568, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0710059171597632, |
|
"eval_loss": 0.6846657395362854, |
|
"eval_runtime": 5.4826, |
|
"eval_samples_per_second": 16.233, |
|
"eval_steps_per_second": 2.189, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.2189349112426036, |
|
"grad_norm": 0.732421875, |
|
"learning_rate": 0.00012512512512512512, |
|
"loss": 0.2759, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.2189349112426036, |
|
"eval_loss": 0.7119464874267578, |
|
"eval_runtime": 5.5426, |
|
"eval_samples_per_second": 16.057, |
|
"eval_steps_per_second": 2.165, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.366863905325444, |
|
"grad_norm": 0.7099609375, |
|
"learning_rate": 0.00012012012012012013, |
|
"loss": 0.2687, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.366863905325444, |
|
"eval_loss": 0.7088969945907593, |
|
"eval_runtime": 5.5287, |
|
"eval_samples_per_second": 16.098, |
|
"eval_steps_per_second": 2.17, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.5147928994082838, |
|
"grad_norm": 0.86767578125, |
|
"learning_rate": 0.00011511511511511512, |
|
"loss": 0.2796, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.5147928994082838, |
|
"eval_loss": 0.716306209564209, |
|
"eval_runtime": 5.5463, |
|
"eval_samples_per_second": 16.047, |
|
"eval_steps_per_second": 2.164, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.662721893491124, |
|
"grad_norm": 0.77587890625, |
|
"learning_rate": 0.00011011011011011012, |
|
"loss": 0.2735, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.662721893491124, |
|
"eval_loss": 0.7141934037208557, |
|
"eval_runtime": 5.4992, |
|
"eval_samples_per_second": 16.184, |
|
"eval_steps_per_second": 2.182, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.8106508875739644, |
|
"grad_norm": 83.5625, |
|
"learning_rate": 0.00010510510510510511, |
|
"loss": 0.284, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.8106508875739644, |
|
"eval_loss": 0.7146495580673218, |
|
"eval_runtime": 5.5227, |
|
"eval_samples_per_second": 16.115, |
|
"eval_steps_per_second": 2.173, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.9585798816568047, |
|
"grad_norm": 0.732421875, |
|
"learning_rate": 0.00010010010010010012, |
|
"loss": 0.2803, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.9585798816568047, |
|
"eval_loss": 0.7089855074882507, |
|
"eval_runtime": 5.499, |
|
"eval_samples_per_second": 16.185, |
|
"eval_steps_per_second": 2.182, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.106508875739645, |
|
"grad_norm": 0.59716796875, |
|
"learning_rate": 9.50950950950951e-05, |
|
"loss": 0.1915, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.106508875739645, |
|
"eval_loss": 0.8113237619400024, |
|
"eval_runtime": 5.5133, |
|
"eval_samples_per_second": 16.143, |
|
"eval_steps_per_second": 2.177, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.2544378698224854, |
|
"grad_norm": 0.76708984375, |
|
"learning_rate": 9.009009009009009e-05, |
|
"loss": 0.16, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.2544378698224854, |
|
"eval_loss": 0.8327358961105347, |
|
"eval_runtime": 5.4786, |
|
"eval_samples_per_second": 16.245, |
|
"eval_steps_per_second": 2.19, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.4023668639053253, |
|
"grad_norm": 0.7197265625, |
|
"learning_rate": 8.50850850850851e-05, |
|
"loss": 0.1621, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.4023668639053253, |
|
"eval_loss": 0.8468723297119141, |
|
"eval_runtime": 5.5083, |
|
"eval_samples_per_second": 16.157, |
|
"eval_steps_per_second": 2.179, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.5502958579881656, |
|
"grad_norm": 0.75146484375, |
|
"learning_rate": 8.008008008008009e-05, |
|
"loss": 0.163, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.5502958579881656, |
|
"eval_loss": 0.8476194143295288, |
|
"eval_runtime": 5.5014, |
|
"eval_samples_per_second": 16.178, |
|
"eval_steps_per_second": 2.181, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.698224852071006, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.507507507507507e-05, |
|
"loss": 0.1615, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.698224852071006, |
|
"eval_loss": 0.8421955108642578, |
|
"eval_runtime": 5.4752, |
|
"eval_samples_per_second": 16.255, |
|
"eval_steps_per_second": 2.192, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 0.85107421875, |
|
"learning_rate": 7.007007007007007e-05, |
|
"loss": 0.1737, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 0.8518087267875671, |
|
"eval_runtime": 5.5254, |
|
"eval_samples_per_second": 16.107, |
|
"eval_steps_per_second": 2.172, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.994082840236686, |
|
"grad_norm": 0.87841796875, |
|
"learning_rate": 6.506506506506507e-05, |
|
"loss": 0.1685, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.994082840236686, |
|
"eval_loss": 0.857303261756897, |
|
"eval_runtime": 5.491, |
|
"eval_samples_per_second": 16.208, |
|
"eval_steps_per_second": 2.185, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.1420118343195265, |
|
"grad_norm": 0.63134765625, |
|
"learning_rate": 6.0060060060060066e-05, |
|
"loss": 0.0961, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.1420118343195265, |
|
"eval_loss": 0.9935606122016907, |
|
"eval_runtime": 5.4899, |
|
"eval_samples_per_second": 16.212, |
|
"eval_steps_per_second": 2.186, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.289940828402367, |
|
"grad_norm": 0.66650390625, |
|
"learning_rate": 5.505505505505506e-05, |
|
"loss": 0.0874, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.289940828402367, |
|
"eval_loss": 1.0188310146331787, |
|
"eval_runtime": 5.4814, |
|
"eval_samples_per_second": 16.237, |
|
"eval_steps_per_second": 2.189, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.437869822485207, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.005005005005006e-05, |
|
"loss": 0.0891, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.437869822485207, |
|
"eval_loss": 1.0284762382507324, |
|
"eval_runtime": 5.5187, |
|
"eval_samples_per_second": 16.127, |
|
"eval_steps_per_second": 2.174, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.585798816568047, |
|
"grad_norm": 0.6240234375, |
|
"learning_rate": 4.5045045045045046e-05, |
|
"loss": 0.0897, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.585798816568047, |
|
"eval_loss": 1.0269498825073242, |
|
"eval_runtime": 5.4812, |
|
"eval_samples_per_second": 16.237, |
|
"eval_steps_per_second": 2.189, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.733727810650888, |
|
"grad_norm": 0.5556640625, |
|
"learning_rate": 4.0040040040040046e-05, |
|
"loss": 0.0882, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.733727810650888, |
|
"eval_loss": 1.0333030223846436, |
|
"eval_runtime": 5.5654, |
|
"eval_samples_per_second": 15.992, |
|
"eval_steps_per_second": 2.156, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.881656804733728, |
|
"grad_norm": 0.91455078125, |
|
"learning_rate": 3.503503503503503e-05, |
|
"loss": 0.0889, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.881656804733728, |
|
"eval_loss": 1.0527359247207642, |
|
"eval_runtime": 5.4892, |
|
"eval_samples_per_second": 16.214, |
|
"eval_steps_per_second": 2.186, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 5.029585798816568, |
|
"grad_norm": 0.477294921875, |
|
"learning_rate": 3.0030030030030033e-05, |
|
"loss": 0.0826, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.029585798816568, |
|
"eval_loss": 1.0765188932418823, |
|
"eval_runtime": 5.5207, |
|
"eval_samples_per_second": 16.121, |
|
"eval_steps_per_second": 2.174, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.177514792899408, |
|
"grad_norm": 0.56689453125, |
|
"learning_rate": 2.502502502502503e-05, |
|
"loss": 0.0519, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 5.177514792899408, |
|
"eval_loss": 1.1578717231750488, |
|
"eval_runtime": 5.4744, |
|
"eval_samples_per_second": 16.258, |
|
"eval_steps_per_second": 2.192, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 5.325443786982248, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 2.0020020020020023e-05, |
|
"loss": 0.0513, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.325443786982248, |
|
"eval_loss": 1.1684048175811768, |
|
"eval_runtime": 5.482, |
|
"eval_samples_per_second": 16.235, |
|
"eval_steps_per_second": 2.189, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.4733727810650885, |
|
"grad_norm": 0.58251953125, |
|
"learning_rate": 1.5015015015015016e-05, |
|
"loss": 0.0523, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 5.4733727810650885, |
|
"eval_loss": 1.1905882358551025, |
|
"eval_runtime": 5.4845, |
|
"eval_samples_per_second": 16.228, |
|
"eval_steps_per_second": 2.188, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 5.621301775147929, |
|
"grad_norm": 0.53466796875, |
|
"learning_rate": 1.0010010010010011e-05, |
|
"loss": 0.0496, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 5.621301775147929, |
|
"eval_loss": 1.1795778274536133, |
|
"eval_runtime": 5.5354, |
|
"eval_samples_per_second": 16.078, |
|
"eval_steps_per_second": 2.168, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 0.432861328125, |
|
"learning_rate": 5.005005005005006e-06, |
|
"loss": 0.0495, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"eval_loss": 1.1850156784057617, |
|
"eval_runtime": 5.5478, |
|
"eval_samples_per_second": 16.042, |
|
"eval_steps_per_second": 2.163, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 5.9171597633136095, |
|
"grad_norm": 0.495849609375, |
|
"learning_rate": 0.0, |
|
"loss": 0.0479, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.9171597633136095, |
|
"eval_loss": 1.18731689453125, |
|
"eval_runtime": 5.4886, |
|
"eval_samples_per_second": 16.215, |
|
"eval_steps_per_second": 2.186, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.9171597633136095, |
|
"step": 1000, |
|
"total_flos": 1.75885655212032e+17, |
|
"train_loss": 0.27937755072116854, |
|
"train_runtime": 1227.8544, |
|
"train_samples_per_second": 3.258, |
|
"train_steps_per_second": 0.814 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 25, |
|
"total_flos": 1.75885655212032e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|