{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006492347145801886, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.492347145801887e-05, "eval_loss": 1.2574450969696045, "eval_runtime": 2164.1177, "eval_samples_per_second": 11.987, "eval_steps_per_second": 1.499, "step": 1 }, { "epoch": 0.00019477041437405659, "grad_norm": 0.21717190742492676, "learning_rate": 3e-05, "loss": 1.1457, "step": 3 }, { "epoch": 0.00038954082874811317, "grad_norm": 0.1491047739982605, "learning_rate": 6e-05, "loss": 1.2315, "step": 6 }, { "epoch": 0.0005843112431221698, "grad_norm": 0.18038997054100037, "learning_rate": 9e-05, "loss": 1.1694, "step": 9 }, { "epoch": 0.0005843112431221698, "eval_loss": 1.246630072593689, "eval_runtime": 2177.4551, "eval_samples_per_second": 11.914, "eval_steps_per_second": 1.489, "step": 9 }, { "epoch": 0.0007790816574962263, "grad_norm": 0.2281239777803421, "learning_rate": 9.987820251299122e-05, "loss": 1.3245, "step": 12 }, { "epoch": 0.0009738520718702829, "grad_norm": 0.24129489064216614, "learning_rate": 9.924038765061042e-05, "loss": 1.2447, "step": 15 }, { "epoch": 0.0011686224862443395, "grad_norm": 0.20848846435546875, "learning_rate": 9.806308479691595e-05, "loss": 1.3351, "step": 18 }, { "epoch": 0.0011686224862443395, "eval_loss": 1.1892844438552856, "eval_runtime": 2174.5469, "eval_samples_per_second": 11.93, "eval_steps_per_second": 1.491, "step": 18 }, { "epoch": 0.001363392900618396, "grad_norm": 0.22571416199207306, "learning_rate": 9.635919272833938e-05, "loss": 1.1832, "step": 21 }, { "epoch": 0.0015581633149924527, "grad_norm": 0.24647393822669983, "learning_rate": 9.414737964294636e-05, "loss": 1.3137, "step": 24 }, { "epoch": 0.0017529337293665092, "grad_norm": 0.2367832213640213, "learning_rate": 9.145187862775209e-05, "loss": 1.24, "step": 27 }, { "epoch": 0.0017529337293665092, "eval_loss": 1.1527782678604126, "eval_runtime": 2174.8129, "eval_samples_per_second": 11.928, "eval_steps_per_second": 1.491, "step": 27 }, { "epoch": 0.0019477041437405659, "grad_norm": 0.2434782236814499, "learning_rate": 8.83022221559489e-05, "loss": 1.0845, "step": 30 }, { "epoch": 0.0021424745581146223, "grad_norm": 0.1957402527332306, "learning_rate": 8.473291852294987e-05, "loss": 1.0744, "step": 33 }, { "epoch": 0.002337244972488679, "grad_norm": 0.20614418387413025, "learning_rate": 8.07830737662829e-05, "loss": 1.0835, "step": 36 }, { "epoch": 0.002337244972488679, "eval_loss": 1.136653184890747, "eval_runtime": 2176.2744, "eval_samples_per_second": 11.92, "eval_steps_per_second": 1.49, "step": 36 }, { "epoch": 0.0025320153868627357, "grad_norm": 0.19403059780597687, "learning_rate": 7.649596321166024e-05, "loss": 1.2458, "step": 39 }, { "epoch": 0.002726785801236792, "grad_norm": 0.2524452805519104, "learning_rate": 7.191855733945387e-05, "loss": 1.0825, "step": 42 }, { "epoch": 0.0029215562156108487, "grad_norm": 0.22213758528232574, "learning_rate": 6.710100716628344e-05, "loss": 1.1252, "step": 45 }, { "epoch": 0.0029215562156108487, "eval_loss": 1.1249250173568726, "eval_runtime": 2174.4018, "eval_samples_per_second": 11.931, "eval_steps_per_second": 1.491, "step": 45 }, { "epoch": 0.0031163266299849054, "grad_norm": 0.3361480236053467, "learning_rate": 6.209609477998338e-05, "loss": 1.1268, "step": 48 }, { "epoch": 0.003311097044358962, "grad_norm": 0.26797690987586975, "learning_rate": 5.695865504800327e-05, "loss": 1.0806, "step": 51 }, { "epoch": 0.0035058674587330183, "grad_norm": 0.24251677095890045, "learning_rate": 5.174497483512506e-05, "loss": 1.1435, "step": 54 }, { "epoch": 0.0035058674587330183, "eval_loss": 1.1167911291122437, "eval_runtime": 2174.2458, "eval_samples_per_second": 11.931, "eval_steps_per_second": 1.492, "step": 54 }, { "epoch": 0.003700637873107075, "grad_norm": 0.25566375255584717, "learning_rate": 4.6512176312793736e-05, "loss": 1.1187, "step": 57 }, { "epoch": 0.0038954082874811317, "grad_norm": 0.22987186908721924, "learning_rate": 4.131759111665349e-05, "loss": 1.1929, "step": 60 }, { "epoch": 0.004090178701855188, "grad_norm": 0.2270929515361786, "learning_rate": 3.6218132209150045e-05, "loss": 1.061, "step": 63 }, { "epoch": 0.004090178701855188, "eval_loss": 1.1118545532226562, "eval_runtime": 2176.6142, "eval_samples_per_second": 11.919, "eval_steps_per_second": 1.49, "step": 63 }, { "epoch": 0.004284949116229245, "grad_norm": 0.24837514758110046, "learning_rate": 3.12696703292044e-05, "loss": 1.0437, "step": 66 }, { "epoch": 0.004479719530603301, "grad_norm": 0.2640354037284851, "learning_rate": 2.6526421860705473e-05, "loss": 0.9759, "step": 69 }, { "epoch": 0.004674489944977358, "grad_norm": 0.2336353361606598, "learning_rate": 2.2040354826462668e-05, "loss": 1.009, "step": 72 }, { "epoch": 0.004674489944977358, "eval_loss": 1.107908844947815, "eval_runtime": 2174.4566, "eval_samples_per_second": 11.93, "eval_steps_per_second": 1.491, "step": 72 }, { "epoch": 0.004869260359351414, "grad_norm": 0.2683839201927185, "learning_rate": 1.7860619515673033e-05, "loss": 1.0181, "step": 75 }, { "epoch": 0.0050640307737254715, "grad_norm": 0.2877131998538971, "learning_rate": 1.4033009983067452e-05, "loss": 1.0598, "step": 78 }, { "epoch": 0.005258801188099528, "grad_norm": 0.278920441865921, "learning_rate": 1.0599462319663905e-05, "loss": 1.1018, "step": 81 }, { "epoch": 0.005258801188099528, "eval_loss": 1.1061383485794067, "eval_runtime": 2176.8672, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 81 }, { "epoch": 0.005453571602473584, "grad_norm": 0.2761317491531372, "learning_rate": 7.597595192178702e-06, "loss": 1.024, "step": 84 }, { "epoch": 0.005648342016847641, "grad_norm": 0.29402637481689453, "learning_rate": 5.060297685041659e-06, "loss": 1.2208, "step": 87 }, { "epoch": 0.005843112431221697, "grad_norm": 0.1755310297012329, "learning_rate": 3.0153689607045845e-06, "loss": 0.951, "step": 90 }, { "epoch": 0.005843112431221697, "eval_loss": 1.1051793098449707, "eval_runtime": 2177.0043, "eval_samples_per_second": 11.916, "eval_steps_per_second": 1.49, "step": 90 }, { "epoch": 0.006037882845595754, "grad_norm": 0.2584977447986603, "learning_rate": 1.4852136862001764e-06, "loss": 1.0409, "step": 93 }, { "epoch": 0.006232653259969811, "grad_norm": 0.21517957746982574, "learning_rate": 4.865965629214819e-07, "loss": 1.102, "step": 96 }, { "epoch": 0.006427423674343867, "grad_norm": 0.2704929709434509, "learning_rate": 3.04586490452119e-08, "loss": 1.256, "step": 99 }, { "epoch": 0.006427423674343867, "eval_loss": 1.1049902439117432, "eval_runtime": 2176.592, "eval_samples_per_second": 11.919, "eval_steps_per_second": 1.49, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6401962573286605e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }