|
{ |
|
"best_metric": 0.21073441207408905, |
|
"best_model_checkpoint": "ckpt/llama2_13b_fuze27_no_sys/object_counting_no_sys/checkpoint-400", |
|
"epoch": 3.6842105263157894, |
|
"eval_steps": 100, |
|
"global_step": 700, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05263157894736842, |
|
"grad_norm": 6.26442289352417, |
|
"learning_rate": 5e-05, |
|
"loss": 10.645, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 4.90908145904541, |
|
"learning_rate": 0.0001, |
|
"loss": 4.9002, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15789473684210525, |
|
"grad_norm": 7.5457682609558105, |
|
"learning_rate": 9.99714745464859e-05, |
|
"loss": 0.9012, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 9.139291763305664, |
|
"learning_rate": 9.988593073400354e-05, |
|
"loss": 0.7022, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 5.00139856338501, |
|
"learning_rate": 9.974346616959476e-05, |
|
"loss": 0.7425, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 6.002871990203857, |
|
"learning_rate": 9.954424340791196e-05, |
|
"loss": 0.7609, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3684210526315789, |
|
"grad_norm": 6.101377010345459, |
|
"learning_rate": 9.928848976574019e-05, |
|
"loss": 0.6316, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 5.7651824951171875, |
|
"learning_rate": 9.897649706262473e-05, |
|
"loss": 0.6406, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.47368421052631576, |
|
"grad_norm": 2.46134614944458, |
|
"learning_rate": 9.860862128789953e-05, |
|
"loss": 0.5597, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 4.3927178382873535, |
|
"learning_rate": 9.818528219449705e-05, |
|
"loss": 0.477, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"eval_loss": 0.46219402551651, |
|
"eval_runtime": 0.6936, |
|
"eval_samples_per_second": 57.669, |
|
"eval_steps_per_second": 14.417, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5789473684210527, |
|
"grad_norm": 6.058330059051514, |
|
"learning_rate": 9.770696282000244e-05, |
|
"loss": 0.4798, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 4.410764217376709, |
|
"learning_rate": 9.717420893549902e-05, |
|
"loss": 0.5097, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6842105263157895, |
|
"grad_norm": 1.7051620483398438, |
|
"learning_rate": 9.658762842283343e-05, |
|
"loss": 0.6507, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 3.638519763946533, |
|
"learning_rate": 9.594789058101153e-05, |
|
"loss": 0.4961, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 1.8080860376358032, |
|
"learning_rate": 9.525572536251607e-05, |
|
"loss": 0.3643, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 4.843872547149658, |
|
"learning_rate": 9.451192254041758e-05, |
|
"loss": 0.5859, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8947368421052632, |
|
"grad_norm": 7.292293548583984, |
|
"learning_rate": 9.371733080722911e-05, |
|
"loss": 0.4765, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 2.1483707427978516, |
|
"learning_rate": 9.287285680653254e-05, |
|
"loss": 0.4496, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.2343636751174927, |
|
"learning_rate": 9.197946409848194e-05, |
|
"loss": 0.42, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 3.902963638305664, |
|
"learning_rate": 9.103817206036382e-05, |
|
"loss": 0.4647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"eval_loss": 0.2513052821159363, |
|
"eval_runtime": 0.6934, |
|
"eval_samples_per_second": 57.69, |
|
"eval_steps_per_second": 14.423, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1052631578947367, |
|
"grad_norm": 3.0811166763305664, |
|
"learning_rate": 9.005005472346924e-05, |
|
"loss": 0.3552, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 12.97741413116455, |
|
"learning_rate": 8.90162395476046e-05, |
|
"loss": 0.5985, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2105263157894737, |
|
"grad_norm": 4.255702495574951, |
|
"learning_rate": 8.793790613463955e-05, |
|
"loss": 0.6665, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 3.5546047687530518, |
|
"learning_rate": 8.681628488255986e-05, |
|
"loss": 0.4249, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3157894736842106, |
|
"grad_norm": 9.395575523376465, |
|
"learning_rate": 8.565265558156101e-05, |
|
"loss": 0.4735, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 3.7864913940429688, |
|
"learning_rate": 8.444834595378434e-05, |
|
"loss": 0.265, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4210526315789473, |
|
"grad_norm": 6.043397426605225, |
|
"learning_rate": 8.320473013836196e-05, |
|
"loss": 0.5925, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 5.700586795806885, |
|
"learning_rate": 8.192322712349917e-05, |
|
"loss": 0.4528, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.526315789473684, |
|
"grad_norm": 2.9391582012176514, |
|
"learning_rate": 8.060529912738315e-05, |
|
"loss": 0.3558, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 1.8771816492080688, |
|
"learning_rate": 7.925244992976538e-05, |
|
"loss": 0.5231, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"eval_loss": 0.34315773844718933, |
|
"eval_runtime": 0.6944, |
|
"eval_samples_per_second": 57.607, |
|
"eval_steps_per_second": 14.402, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.631578947368421, |
|
"grad_norm": 4.8421478271484375, |
|
"learning_rate": 7.786622315612183e-05, |
|
"loss": 0.3751, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 11.370196342468262, |
|
"learning_rate": 7.644820051634812e-05, |
|
"loss": 0.3844, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 3.433976173400879, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.5589, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 6.323594570159912, |
|
"learning_rate": 7.35232740301378e-05, |
|
"loss": 0.2688, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8421052631578947, |
|
"grad_norm": 0.5850329399108887, |
|
"learning_rate": 7.201970757788172e-05, |
|
"loss": 0.2528, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 1.9679738283157349, |
|
"learning_rate": 7.049101623982937e-05, |
|
"loss": 0.2398, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.9473684210526314, |
|
"grad_norm": 12.53429889678955, |
|
"learning_rate": 6.89389442805288e-05, |
|
"loss": 0.4979, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 7.08478307723999, |
|
"learning_rate": 6.736526264224101e-05, |
|
"loss": 0.5478, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.0526315789473686, |
|
"grad_norm": 9.115711212158203, |
|
"learning_rate": 6.577176692426279e-05, |
|
"loss": 0.4545, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 6.18236780166626, |
|
"learning_rate": 6.416027533411519e-05, |
|
"loss": 0.4381, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"eval_loss": 0.21073441207408905, |
|
"eval_runtime": 0.6912, |
|
"eval_samples_per_second": 57.873, |
|
"eval_steps_per_second": 14.468, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.1578947368421053, |
|
"grad_norm": 1.294372797012329, |
|
"learning_rate": 6.253262661293604e-05, |
|
"loss": 0.2746, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.2105263157894735, |
|
"grad_norm": 9.00351619720459, |
|
"learning_rate": 6.0890677937442574e-05, |
|
"loss": 0.2276, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.263157894736842, |
|
"grad_norm": 10.986132621765137, |
|
"learning_rate": 5.923630280085948e-05, |
|
"loss": 0.2474, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.3157894736842106, |
|
"grad_norm": 12.219366073608398, |
|
"learning_rate": 5.757138887522884e-05, |
|
"loss": 0.2644, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.3684210526315788, |
|
"grad_norm": 2.5177173614501953, |
|
"learning_rate": 5.5897835857542317e-05, |
|
"loss": 0.3294, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 6.946788787841797, |
|
"learning_rate": 5.4217553302152237e-05, |
|
"loss": 0.5553, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.473684210526316, |
|
"grad_norm": 3.0805630683898926, |
|
"learning_rate": 5.2532458441935636e-05, |
|
"loss": 0.1107, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.526315789473684, |
|
"grad_norm": 3.5907771587371826, |
|
"learning_rate": 5.084447400069655e-05, |
|
"loss": 0.2507, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.5789473684210527, |
|
"grad_norm": 1.5780644416809082, |
|
"learning_rate": 4.915552599930345e-05, |
|
"loss": 0.1839, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 2.739733934402466, |
|
"learning_rate": 4.746754155806437e-05, |
|
"loss": 0.2159, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"eval_loss": 0.36658158898353577, |
|
"eval_runtime": 0.6913, |
|
"eval_samples_per_second": 57.864, |
|
"eval_steps_per_second": 14.466, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.6842105263157894, |
|
"grad_norm": 1.3136796951293945, |
|
"learning_rate": 4.578244669784777e-05, |
|
"loss": 0.1742, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.736842105263158, |
|
"grad_norm": 0.0468708761036396, |
|
"learning_rate": 4.410216414245771e-05, |
|
"loss": 0.3915, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.7894736842105265, |
|
"grad_norm": 12.833720207214355, |
|
"learning_rate": 4.2428611124771184e-05, |
|
"loss": 0.2055, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.8421052631578947, |
|
"grad_norm": 0.10741530358791351, |
|
"learning_rate": 4.076369719914055e-05, |
|
"loss": 0.3395, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.8947368421052633, |
|
"grad_norm": 6.424005508422852, |
|
"learning_rate": 3.9109322062557424e-05, |
|
"loss": 0.2351, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 7.805177211761475, |
|
"learning_rate": 3.746737338706397e-05, |
|
"loss": 0.1548, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.9612077474594116, |
|
"learning_rate": 3.58397246658848e-05, |
|
"loss": 0.1734, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.0526315789473686, |
|
"grad_norm": 6.020046234130859, |
|
"learning_rate": 3.422823307573722e-05, |
|
"loss": 0.1015, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.1052631578947367, |
|
"grad_norm": 5.7347307205200195, |
|
"learning_rate": 3.263473735775899e-05, |
|
"loss": 0.1705, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 15.45874309539795, |
|
"learning_rate": 3.10610557194712e-05, |
|
"loss": 0.1934, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"eval_loss": 0.26252931356430054, |
|
"eval_runtime": 0.6908, |
|
"eval_samples_per_second": 57.906, |
|
"eval_steps_per_second": 14.476, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.2105263157894735, |
|
"grad_norm": 1.443434715270996, |
|
"learning_rate": 2.950898376017064e-05, |
|
"loss": 0.1024, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.263157894736842, |
|
"grad_norm": 8.99201488494873, |
|
"learning_rate": 2.798029242211828e-05, |
|
"loss": 0.1786, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.3157894736842106, |
|
"grad_norm": 3.1811983585357666, |
|
"learning_rate": 2.6476725969862227e-05, |
|
"loss": 0.1907, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.3684210526315788, |
|
"grad_norm": 16.603193283081055, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.125, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.4210526315789473, |
|
"grad_norm": 6.346674919128418, |
|
"learning_rate": 2.3551799483651894e-05, |
|
"loss": 0.1509, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.473684210526316, |
|
"grad_norm": 3.2951841354370117, |
|
"learning_rate": 2.2133776843878186e-05, |
|
"loss": 0.1502, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.526315789473684, |
|
"grad_norm": 4.906289100646973, |
|
"learning_rate": 2.074755007023461e-05, |
|
"loss": 0.1458, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.5789473684210527, |
|
"grad_norm": 0.9275074005126953, |
|
"learning_rate": 1.9394700872616855e-05, |
|
"loss": 0.2181, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.6315789473684212, |
|
"grad_norm": 0.8629674315452576, |
|
"learning_rate": 1.807677287650083e-05, |
|
"loss": 0.0979, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 6.139326095581055, |
|
"learning_rate": 1.6795269861638042e-05, |
|
"loss": 0.2701, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"eval_loss": 0.2853814661502838, |
|
"eval_runtime": 0.691, |
|
"eval_samples_per_second": 57.884, |
|
"eval_steps_per_second": 14.471, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"step": 700, |
|
"total_flos": 1.461153753759744e+16, |
|
"train_loss": 0.5859067852156503, |
|
"train_runtime": 192.1858, |
|
"train_samples_per_second": 19.773, |
|
"train_steps_per_second": 4.943 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 950, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"total_flos": 1.461153753759744e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|