{ "best_metric": 1.6663275957107544, "best_model_checkpoint": "/data/user_data/gonilude/java_and_text_pythia_410m/checkpoint-150", "epoch": 3.0, "eval_steps": 50, "global_step": 243, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_accuracy": 0.25, "eval_loss": 2.6309967041015625, "eval_runtime": 0.5335, "eval_samples_per_second": 134.957, "eval_steps_per_second": 16.87, "num_input_tokens_seen": 0, "step": 0 }, { "epoch": 0.012345679012345678, "grad_norm": NaN, "learning_rate": 0.0, "loss": 3.7346, "num_input_tokens_seen": 8192, "step": 1 }, { "epoch": 0.06172839506172839, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 3.0853, "num_input_tokens_seen": 40960, "step": 5 }, { "epoch": 0.12345679012345678, "grad_norm": 140.6628875732422, "learning_rate": 1e-05, "loss": 2.951, "num_input_tokens_seen": 81920, "step": 10 }, { "epoch": 0.18518518518518517, "grad_norm": 121.90998840332031, "learning_rate": 1.9999106432103785e-05, "loss": 3.6623, "num_input_tokens_seen": 122880, "step": 15 }, { "epoch": 0.24691358024691357, "grad_norm": 119.89352416992188, "learning_rate": 1.9967848320275253e-05, "loss": 2.5551, "num_input_tokens_seen": 163840, "step": 20 }, { "epoch": 0.30864197530864196, "grad_norm": 69.07218170166016, "learning_rate": 1.9892071378116378e-05, "loss": 2.146, "num_input_tokens_seen": 204800, "step": 25 }, { "epoch": 0.37037037037037035, "grad_norm": 38.68893051147461, "learning_rate": 1.9772114043845968e-05, "loss": 2.4626, "num_input_tokens_seen": 245760, "step": 30 }, { "epoch": 0.43209876543209874, "grad_norm": 12.328490257263184, "learning_rate": 1.9608512076038964e-05, "loss": 1.976, "num_input_tokens_seen": 286720, "step": 35 }, { "epoch": 0.49382716049382713, "grad_norm": 28.689950942993164, "learning_rate": 1.9401996160798574e-05, "loss": 1.8516, "num_input_tokens_seen": 327680, "step": 40 }, { "epoch": 0.5555555555555556, "grad_norm": 20.923078536987305, "learning_rate": 1.915348864833476e-05, "loss": 1.669, "num_input_tokens_seen": 368640, "step": 45 }, { "epoch": 0.6172839506172839, "grad_norm": 60.305381774902344, "learning_rate": 1.8864099433524302e-05, "loss": 1.9603, "num_input_tokens_seen": 409600, "step": 50 }, { "epoch": 0.6172839506172839, "eval_accuracy": 0.19444444444444445, "eval_loss": 1.8224691152572632, "eval_runtime": 0.3688, "eval_samples_per_second": 195.251, "eval_steps_per_second": 24.406, "num_input_tokens_seen": 409600, "step": 50 }, { "epoch": 0.6790123456790124, "grad_norm": 37.31528854370117, "learning_rate": 1.853512099885085e-05, "loss": 1.6856, "num_input_tokens_seen": 450560, "step": 55 }, { "epoch": 0.7407407407407407, "grad_norm": 34.03938674926758, "learning_rate": 1.816802264186438e-05, "loss": 1.989, "num_input_tokens_seen": 491520, "step": 60 }, { "epoch": 0.8024691358024691, "grad_norm": 27.516170501708984, "learning_rate": 1.7764443912941675e-05, "loss": 1.8994, "num_input_tokens_seen": 532480, "step": 65 }, { "epoch": 0.8641975308641975, "grad_norm": 23.523881912231445, "learning_rate": 1.7326187292656332e-05, "loss": 1.7613, "num_input_tokens_seen": 573440, "step": 70 }, { "epoch": 0.9259259259259259, "grad_norm": 25.84837532043457, "learning_rate": 1.6855210141462964e-05, "loss": 1.9531, "num_input_tokens_seen": 614400, "step": 75 }, { "epoch": 0.9876543209876543, "grad_norm": 34.3070068359375, "learning_rate": 1.635361595765024e-05, "loss": 1.9051, "num_input_tokens_seen": 655360, "step": 80 }, { "epoch": 1.0493827160493827, "grad_norm": 31.36409568786621, "learning_rate": 1.5823644982606905e-05, "loss": 1.8392, "num_input_tokens_seen": 696320, "step": 85 }, { "epoch": 1.1111111111111112, "grad_norm": 27.246265411376953, "learning_rate": 1.5267664195359917e-05, "loss": 1.6923, "num_input_tokens_seen": 737280, "step": 90 }, { "epoch": 1.1728395061728394, "grad_norm": 41.13190841674805, "learning_rate": 1.4688156741071513e-05, "loss": 1.8443, "num_input_tokens_seen": 778240, "step": 95 }, { "epoch": 1.2345679012345678, "grad_norm": 38.65595626831055, "learning_rate": 1.408771084071012e-05, "loss": 1.7774, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 1.2345679012345678, "eval_accuracy": 0.18055555555555555, "eval_loss": 1.7543673515319824, "eval_runtime": 0.371, "eval_samples_per_second": 194.075, "eval_steps_per_second": 24.259, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 1.2962962962962963, "grad_norm": 14.981557846069336, "learning_rate": 1.3469008231427207e-05, "loss": 1.8287, "num_input_tokens_seen": 860160, "step": 105 }, { "epoch": 1.3580246913580247, "grad_norm": 21.5086727142334, "learning_rate": 1.283481218926818e-05, "loss": 1.5601, "num_input_tokens_seen": 901120, "step": 110 }, { "epoch": 1.4197530864197532, "grad_norm": 24.150880813598633, "learning_rate": 1.2187955187710752e-05, "loss": 1.8594, "num_input_tokens_seen": 942080, "step": 115 }, { "epoch": 1.4814814814814814, "grad_norm": 11.27273941040039, "learning_rate": 1.1531326247150802e-05, "loss": 1.6694, "num_input_tokens_seen": 983040, "step": 120 }, { "epoch": 1.5432098765432098, "grad_norm": 24.10595703125, "learning_rate": 1.0867858031835975e-05, "loss": 1.6684, "num_input_tokens_seen": 1024000, "step": 125 }, { "epoch": 1.6049382716049383, "grad_norm": 15.398138046264648, "learning_rate": 1.0200513751875227e-05, "loss": 1.6126, "num_input_tokens_seen": 1064960, "step": 130 }, { "epoch": 1.6666666666666665, "grad_norm": 25.667972564697266, "learning_rate": 9.532273928823151e-06, "loss": 1.5059, "num_input_tokens_seen": 1105920, "step": 135 }, { "epoch": 1.7283950617283952, "grad_norm": 21.25739288330078, "learning_rate": 8.866123083947182e-06, "loss": 1.6684, "num_input_tokens_seen": 1146880, "step": 140 }, { "epoch": 1.7901234567901234, "grad_norm": 17.899890899658203, "learning_rate": 8.20503640863111e-06, "loss": 1.5919, "num_input_tokens_seen": 1187840, "step": 145 }, { "epoch": 1.8518518518518519, "grad_norm": 25.205127716064453, "learning_rate": 7.55196647644814e-06, "loss": 1.6193, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 1.8518518518518519, "eval_accuracy": 0.18055555555555555, "eval_loss": 1.6663275957107544, "eval_runtime": 0.3724, "eval_samples_per_second": 193.328, "eval_steps_per_second": 24.166, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 1.9135802469135803, "grad_norm": 32.01460647583008, "learning_rate": 6.909830056250527e-06, "loss": 1.649, "num_input_tokens_seen": 1269760, "step": 155 }, { "epoch": 1.9753086419753085, "grad_norm": 23.55994415283203, "learning_rate": 6.2814950851718695e-06, "loss": 1.7344, "num_input_tokens_seen": 1310720, "step": 160 }, { "epoch": 2.037037037037037, "grad_norm": 40.77457046508789, "learning_rate": 5.669767859723636e-06, "loss": 1.5739, "num_input_tokens_seen": 1351680, "step": 165 }, { "epoch": 2.0987654320987654, "grad_norm": 23.66048240661621, "learning_rate": 5.077380502193725e-06, "loss": 1.4769, "num_input_tokens_seen": 1392640, "step": 170 }, { "epoch": 2.1604938271604937, "grad_norm": 20.150760650634766, "learning_rate": 4.5069787583250815e-06, "loss": 1.5486, "num_input_tokens_seen": 1433600, "step": 175 }, { "epoch": 2.2222222222222223, "grad_norm": 11.237411499023438, "learning_rate": 3.961110180772955e-06, "loss": 1.5313, "num_input_tokens_seen": 1474560, "step": 180 }, { "epoch": 2.2839506172839505, "grad_norm": 30.582534790039062, "learning_rate": 3.442212751116305e-06, "loss": 1.6429, "num_input_tokens_seen": 1515520, "step": 185 }, { "epoch": 2.3456790123456788, "grad_norm": 24.695331573486328, "learning_rate": 2.9526039912402504e-06, "loss": 1.4812, "num_input_tokens_seen": 1556480, "step": 190 }, { "epoch": 2.4074074074074074, "grad_norm": 6.564646244049072, "learning_rate": 2.4944706127207252e-06, "loss": 1.5954, "num_input_tokens_seen": 1597440, "step": 195 }, { "epoch": 2.4691358024691357, "grad_norm": 31.85268783569336, "learning_rate": 2.0698587504397684e-06, "loss": 1.7311, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 2.4691358024691357, "eval_accuracy": 0.19444444444444445, "eval_loss": 1.7398681640625, "eval_runtime": 0.4238, "eval_samples_per_second": 169.875, "eval_steps_per_second": 21.234, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 2.5308641975308643, "grad_norm": 29.221206665039062, "learning_rate": 1.680664824050432e-06, "loss": 1.4484, "num_input_tokens_seen": 1679360, "step": 205 }, { "epoch": 2.5925925925925926, "grad_norm": 35.87575149536133, "learning_rate": 1.3286270681062275e-06, "loss": 1.5614, "num_input_tokens_seen": 1720320, "step": 210 }, { "epoch": 2.6543209876543212, "grad_norm": 29.08688735961914, "learning_rate": 1.015317768683669e-06, "loss": 1.5382, "num_input_tokens_seen": 1761280, "step": 215 }, { "epoch": 2.7160493827160495, "grad_norm": 23.70969581604004, "learning_rate": 7.421362411709676e-07, "loss": 1.5272, "num_input_tokens_seen": 1802240, "step": 220 }, { "epoch": 2.7777777777777777, "grad_norm": 35.782222747802734, "learning_rate": 5.103025805858197e-07, "loss": 1.5842, "num_input_tokens_seen": 1843200, "step": 225 }, { "epoch": 2.8395061728395063, "grad_norm": 31.512516021728516, "learning_rate": 3.2085221233487564e-07, "loss": 1.5197, "num_input_tokens_seen": 1884160, "step": 230 }, { "epoch": 2.9012345679012346, "grad_norm": 24.439847946166992, "learning_rate": 1.7463126775252192e-07, "loss": 1.5678, "num_input_tokens_seen": 1925120, "step": 235 }, { "epoch": 2.962962962962963, "grad_norm": 28.297935485839844, "learning_rate": 7.229280507293657e-08, "loss": 1.6146, "num_input_tokens_seen": 1966080, "step": 240 }, { "epoch": 3.0, "num_input_tokens_seen": 1990656, "step": 243, "total_flos": 3610849453277184.0, "train_loss": 1.8222727520475663, "train_runtime": 109.7362, "train_samples_per_second": 17.633, "train_steps_per_second": 2.214 } ], "logging_steps": 5, "max_steps": 243, "num_input_tokens_seen": 1990656, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3610849453277184.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }