{
  "best_metric": 1.6663275957107544,
  "best_model_checkpoint": "/data/user_data/gonilude/java_and_text_pythia_410m/checkpoint-150",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 243,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_accuracy": 0.25,
      "eval_loss": 2.6309967041015625,
      "eval_runtime": 0.5335,
      "eval_samples_per_second": 134.957,
      "eval_steps_per_second": 16.87,
      "num_input_tokens_seen": 0,
      "step": 0
    },
    {
      "epoch": 0.012345679012345678,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 3.7346,
      "num_input_tokens_seen": 8192,
      "step": 1
    },
    {
      "epoch": 0.06172839506172839,
      "grad_norm": Infinity,
      "learning_rate": 0.0,
      "loss": 3.0853,
      "num_input_tokens_seen": 40960,
      "step": 5
    },
    {
      "epoch": 0.12345679012345678,
      "grad_norm": 140.6628875732422,
      "learning_rate": 1e-05,
      "loss": 2.951,
      "num_input_tokens_seen": 81920,
      "step": 10
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 121.90998840332031,
      "learning_rate": 1.9999106432103785e-05,
      "loss": 3.6623,
      "num_input_tokens_seen": 122880,
      "step": 15
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 119.89352416992188,
      "learning_rate": 1.9967848320275253e-05,
      "loss": 2.5551,
      "num_input_tokens_seen": 163840,
      "step": 20
    },
    {
      "epoch": 0.30864197530864196,
      "grad_norm": 69.07218170166016,
      "learning_rate": 1.9892071378116378e-05,
      "loss": 2.146,
      "num_input_tokens_seen": 204800,
      "step": 25
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 38.68893051147461,
      "learning_rate": 1.9772114043845968e-05,
      "loss": 2.4626,
      "num_input_tokens_seen": 245760,
      "step": 30
    },
    {
      "epoch": 0.43209876543209874,
      "grad_norm": 12.328490257263184,
      "learning_rate": 1.9608512076038964e-05,
      "loss": 1.976,
      "num_input_tokens_seen": 286720,
      "step": 35
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 28.689950942993164,
      "learning_rate": 1.9401996160798574e-05,
      "loss": 1.8516,
      "num_input_tokens_seen": 327680,
      "step": 40
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 20.923078536987305,
      "learning_rate": 1.915348864833476e-05,
      "loss": 1.669,
      "num_input_tokens_seen": 368640,
      "step": 45
    },
    {
      "epoch": 0.6172839506172839,
      "grad_norm": 60.305381774902344,
      "learning_rate": 1.8864099433524302e-05,
      "loss": 1.9603,
      "num_input_tokens_seen": 409600,
      "step": 50
    },
    {
      "epoch": 0.6172839506172839,
      "eval_accuracy": 0.19444444444444445,
      "eval_loss": 1.8224691152572632,
      "eval_runtime": 0.3688,
      "eval_samples_per_second": 195.251,
      "eval_steps_per_second": 24.406,
      "num_input_tokens_seen": 409600,
      "step": 50
    },
    {
      "epoch": 0.6790123456790124,
      "grad_norm": 37.31528854370117,
      "learning_rate": 1.853512099885085e-05,
      "loss": 1.6856,
      "num_input_tokens_seen": 450560,
      "step": 55
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 34.03938674926758,
      "learning_rate": 1.816802264186438e-05,
      "loss": 1.989,
      "num_input_tokens_seen": 491520,
      "step": 60
    },
    {
      "epoch": 0.8024691358024691,
      "grad_norm": 27.516170501708984,
      "learning_rate": 1.7764443912941675e-05,
      "loss": 1.8994,
      "num_input_tokens_seen": 532480,
      "step": 65
    },
    {
      "epoch": 0.8641975308641975,
      "grad_norm": 23.523881912231445,
      "learning_rate": 1.7326187292656332e-05,
      "loss": 1.7613,
      "num_input_tokens_seen": 573440,
      "step": 70
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 25.84837532043457,
      "learning_rate": 1.6855210141462964e-05,
      "loss": 1.9531,
      "num_input_tokens_seen": 614400,
      "step": 75
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 34.3070068359375,
      "learning_rate": 1.635361595765024e-05,
      "loss": 1.9051,
      "num_input_tokens_seen": 655360,
      "step": 80
    },
    {
      "epoch": 1.0493827160493827,
      "grad_norm": 31.36409568786621,
      "learning_rate": 1.5823644982606905e-05,
      "loss": 1.8392,
      "num_input_tokens_seen": 696320,
      "step": 85
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 27.246265411376953,
      "learning_rate": 1.5267664195359917e-05,
      "loss": 1.6923,
      "num_input_tokens_seen": 737280,
      "step": 90
    },
    {
      "epoch": 1.1728395061728394,
      "grad_norm": 41.13190841674805,
      "learning_rate": 1.4688156741071513e-05,
      "loss": 1.8443,
      "num_input_tokens_seen": 778240,
      "step": 95
    },
    {
      "epoch": 1.2345679012345678,
      "grad_norm": 38.65595626831055,
      "learning_rate": 1.408771084071012e-05,
      "loss": 1.7774,
      "num_input_tokens_seen": 819200,
      "step": 100
    },
    {
      "epoch": 1.2345679012345678,
      "eval_accuracy": 0.18055555555555555,
      "eval_loss": 1.7543673515319824,
      "eval_runtime": 0.371,
      "eval_samples_per_second": 194.075,
      "eval_steps_per_second": 24.259,
      "num_input_tokens_seen": 819200,
      "step": 100
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 14.981557846069336,
      "learning_rate": 1.3469008231427207e-05,
      "loss": 1.8287,
      "num_input_tokens_seen": 860160,
      "step": 105
    },
    {
      "epoch": 1.3580246913580247,
      "grad_norm": 21.5086727142334,
      "learning_rate": 1.283481218926818e-05,
      "loss": 1.5601,
      "num_input_tokens_seen": 901120,
      "step": 110
    },
    {
      "epoch": 1.4197530864197532,
      "grad_norm": 24.150880813598633,
      "learning_rate": 1.2187955187710752e-05,
      "loss": 1.8594,
      "num_input_tokens_seen": 942080,
      "step": 115
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 11.27273941040039,
      "learning_rate": 1.1531326247150802e-05,
      "loss": 1.6694,
      "num_input_tokens_seen": 983040,
      "step": 120
    },
    {
      "epoch": 1.5432098765432098,
      "grad_norm": 24.10595703125,
      "learning_rate": 1.0867858031835975e-05,
      "loss": 1.6684,
      "num_input_tokens_seen": 1024000,
      "step": 125
    },
    {
      "epoch": 1.6049382716049383,
      "grad_norm": 15.398138046264648,
      "learning_rate": 1.0200513751875227e-05,
      "loss": 1.6126,
      "num_input_tokens_seen": 1064960,
      "step": 130
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 25.667972564697266,
      "learning_rate": 9.532273928823151e-06,
      "loss": 1.5059,
      "num_input_tokens_seen": 1105920,
      "step": 135
    },
    {
      "epoch": 1.7283950617283952,
      "grad_norm": 21.25739288330078,
      "learning_rate": 8.866123083947182e-06,
      "loss": 1.6684,
      "num_input_tokens_seen": 1146880,
      "step": 140
    },
    {
      "epoch": 1.7901234567901234,
      "grad_norm": 17.899890899658203,
      "learning_rate": 8.20503640863111e-06,
      "loss": 1.5919,
      "num_input_tokens_seen": 1187840,
      "step": 145
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 25.205127716064453,
      "learning_rate": 7.55196647644814e-06,
      "loss": 1.6193,
      "num_input_tokens_seen": 1228800,
      "step": 150
    },
    {
      "epoch": 1.8518518518518519,
      "eval_accuracy": 0.18055555555555555,
      "eval_loss": 1.6663275957107544,
      "eval_runtime": 0.3724,
      "eval_samples_per_second": 193.328,
      "eval_steps_per_second": 24.166,
      "num_input_tokens_seen": 1228800,
      "step": 150
    },
    {
      "epoch": 1.9135802469135803,
      "grad_norm": 32.01460647583008,
      "learning_rate": 6.909830056250527e-06,
      "loss": 1.649,
      "num_input_tokens_seen": 1269760,
      "step": 155
    },
    {
      "epoch": 1.9753086419753085,
      "grad_norm": 23.55994415283203,
      "learning_rate": 6.2814950851718695e-06,
      "loss": 1.7344,
      "num_input_tokens_seen": 1310720,
      "step": 160
    },
    {
      "epoch": 2.037037037037037,
      "grad_norm": 40.77457046508789,
      "learning_rate": 5.669767859723636e-06,
      "loss": 1.5739,
      "num_input_tokens_seen": 1351680,
      "step": 165
    },
    {
      "epoch": 2.0987654320987654,
      "grad_norm": 23.66048240661621,
      "learning_rate": 5.077380502193725e-06,
      "loss": 1.4769,
      "num_input_tokens_seen": 1392640,
      "step": 170
    },
    {
      "epoch": 2.1604938271604937,
      "grad_norm": 20.150760650634766,
      "learning_rate": 4.5069787583250815e-06,
      "loss": 1.5486,
      "num_input_tokens_seen": 1433600,
      "step": 175
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 11.237411499023438,
      "learning_rate": 3.961110180772955e-06,
      "loss": 1.5313,
      "num_input_tokens_seen": 1474560,
      "step": 180
    },
    {
      "epoch": 2.2839506172839505,
      "grad_norm": 30.582534790039062,
      "learning_rate": 3.442212751116305e-06,
      "loss": 1.6429,
      "num_input_tokens_seen": 1515520,
      "step": 185
    },
    {
      "epoch": 2.3456790123456788,
      "grad_norm": 24.695331573486328,
      "learning_rate": 2.9526039912402504e-06,
      "loss": 1.4812,
      "num_input_tokens_seen": 1556480,
      "step": 190
    },
    {
      "epoch": 2.4074074074074074,
      "grad_norm": 6.564646244049072,
      "learning_rate": 2.4944706127207252e-06,
      "loss": 1.5954,
      "num_input_tokens_seen": 1597440,
      "step": 195
    },
    {
      "epoch": 2.4691358024691357,
      "grad_norm": 31.85268783569336,
      "learning_rate": 2.0698587504397684e-06,
      "loss": 1.7311,
      "num_input_tokens_seen": 1638400,
      "step": 200
    },
    {
      "epoch": 2.4691358024691357,
      "eval_accuracy": 0.19444444444444445,
      "eval_loss": 1.7398681640625,
      "eval_runtime": 0.4238,
      "eval_samples_per_second": 169.875,
      "eval_steps_per_second": 21.234,
      "num_input_tokens_seen": 1638400,
      "step": 200
    },
    {
      "epoch": 2.5308641975308643,
      "grad_norm": 29.221206665039062,
      "learning_rate": 1.680664824050432e-06,
      "loss": 1.4484,
      "num_input_tokens_seen": 1679360,
      "step": 205
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 35.87575149536133,
      "learning_rate": 1.3286270681062275e-06,
      "loss": 1.5614,
      "num_input_tokens_seen": 1720320,
      "step": 210
    },
    {
      "epoch": 2.6543209876543212,
      "grad_norm": 29.08688735961914,
      "learning_rate": 1.015317768683669e-06,
      "loss": 1.5382,
      "num_input_tokens_seen": 1761280,
      "step": 215
    },
    {
      "epoch": 2.7160493827160495,
      "grad_norm": 23.70969581604004,
      "learning_rate": 7.421362411709676e-07,
      "loss": 1.5272,
      "num_input_tokens_seen": 1802240,
      "step": 220
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 35.782222747802734,
      "learning_rate": 5.103025805858197e-07,
      "loss": 1.5842,
      "num_input_tokens_seen": 1843200,
      "step": 225
    },
    {
      "epoch": 2.8395061728395063,
      "grad_norm": 31.512516021728516,
      "learning_rate": 3.2085221233487564e-07,
      "loss": 1.5197,
      "num_input_tokens_seen": 1884160,
      "step": 230
    },
    {
      "epoch": 2.9012345679012346,
      "grad_norm": 24.439847946166992,
      "learning_rate": 1.7463126775252192e-07,
      "loss": 1.5678,
      "num_input_tokens_seen": 1925120,
      "step": 235
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 28.297935485839844,
      "learning_rate": 7.229280507293657e-08,
      "loss": 1.6146,
      "num_input_tokens_seen": 1966080,
      "step": 240
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 1990656,
      "step": 243,
      "total_flos": 3610849453277184.0,
      "train_loss": 1.8222727520475663,
      "train_runtime": 109.7362,
      "train_samples_per_second": 17.633,
      "train_steps_per_second": 2.214
    }
  ],
  "logging_steps": 5,
  "max_steps": 243,
  "num_input_tokens_seen": 1990656,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3610849453277184.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}