{ "best_metric": 1.3169949054718018, "best_model_checkpoint": "miner_id_24/checkpoint-350", "epoch": 1.0012453300124533, "eval_steps": 50, "global_step": 402, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024906600249066002, "eval_loss": 2.3575375080108643, "eval_runtime": 9.0845, "eval_samples_per_second": 18.603, "eval_steps_per_second": 4.733, "step": 1 }, { "epoch": 0.024906600249066, "grad_norm": 1.6407170295715332, "learning_rate": 4.36e-05, "loss": 1.6376, "step": 10 }, { "epoch": 0.049813200498132, "grad_norm": 2.079932451248169, "learning_rate": 8.72e-05, "loss": 1.8074, "step": 20 }, { "epoch": 0.074719800747198, "grad_norm": 1.752514123916626, "learning_rate": 0.0001308, "loss": 1.5148, "step": 30 }, { "epoch": 0.099626400996264, "grad_norm": 2.3281145095825195, "learning_rate": 0.0001744, "loss": 1.618, "step": 40 }, { "epoch": 0.12453300124533001, "grad_norm": 2.0914270877838135, "learning_rate": 0.000218, "loss": 1.3761, "step": 50 }, { "epoch": 0.12453300124533001, "eval_loss": 1.7705373764038086, "eval_runtime": 9.1773, "eval_samples_per_second": 18.415, "eval_steps_per_second": 4.685, "step": 50 }, { "epoch": 0.149439601494396, "grad_norm": 1.1187117099761963, "learning_rate": 0.00021756616696447703, "loss": 1.6001, "step": 60 }, { "epoch": 0.17434620174346202, "grad_norm": 1.864639401435852, "learning_rate": 0.00021626812127263666, "loss": 1.5317, "step": 70 }, { "epoch": 0.199252801992528, "grad_norm": 1.9063708782196045, "learning_rate": 0.00021411619567865767, "loss": 1.4666, "step": 80 }, { "epoch": 0.22415940224159403, "grad_norm": 1.9959717988967896, "learning_rate": 0.00021112752002497406, "loss": 1.3854, "step": 90 }, { "epoch": 0.24906600249066002, "grad_norm": 1.7696688175201416, "learning_rate": 0.00020732588488463224, "loss": 1.0668, "step": 100 }, { "epoch": 0.24906600249066002, "eval_loss": 1.671539306640625, "eval_runtime": 9.1879, "eval_samples_per_second": 18.394, "eval_steps_per_second": 4.68, "step": 100 }, { "epoch": 0.273972602739726, "grad_norm": 1.1212961673736572, "learning_rate": 0.00020274155218263936, "loss": 1.7123, "step": 110 }, { "epoch": 0.298879202988792, "grad_norm": 1.5101659297943115, "learning_rate": 0.00019741101430380186, "loss": 1.5006, "step": 120 }, { "epoch": 0.32378580323785805, "grad_norm": 1.5983831882476807, "learning_rate": 0.00019137670360461418, "loss": 1.4108, "step": 130 }, { "epoch": 0.34869240348692404, "grad_norm": 1.7551649808883667, "learning_rate": 0.00018468665464155184, "loss": 1.363, "step": 140 }, { "epoch": 0.37359900373599003, "grad_norm": 1.9306334257125854, "learning_rate": 0.0001773941218045129, "loss": 1.0146, "step": 150 }, { "epoch": 0.37359900373599003, "eval_loss": 1.5669289827346802, "eval_runtime": 9.1037, "eval_samples_per_second": 18.564, "eval_steps_per_second": 4.723, "step": 150 }, { "epoch": 0.398505603985056, "grad_norm": 1.0432745218276978, "learning_rate": 0.00016955715539913665, "loss": 1.4577, "step": 160 }, { "epoch": 0.42341220423412207, "grad_norm": 1.7187490463256836, "learning_rate": 0.0001612381395524862, "loss": 1.4403, "step": 170 }, { "epoch": 0.44831880448318806, "grad_norm": 1.626038908958435, "learning_rate": 0.00015250329562047557, "loss": 1.2982, "step": 180 }, { "epoch": 0.47322540473225405, "grad_norm": 1.8136370182037354, "learning_rate": 0.00014342215505003773, "loss": 1.4308, "step": 190 }, { "epoch": 0.49813200498132004, "grad_norm": 1.1612805128097534, "learning_rate": 0.000134067005892176, "loss": 0.9796, "step": 200 }, { "epoch": 0.49813200498132004, "eval_loss": 1.5291515588760376, "eval_runtime": 9.1903, "eval_samples_per_second": 18.389, "eval_steps_per_second": 4.679, "step": 200 }, { "epoch": 0.523038605230386, "grad_norm": 1.3198999166488647, "learning_rate": 0.0001245123173717881, "loss": 1.8174, "step": 210 }, { "epoch": 0.547945205479452, "grad_norm": 1.404754638671875, "learning_rate": 0.00011483414709482405, "loss": 1.1952, "step": 220 }, { "epoch": 0.572851805728518, "grad_norm": 1.841070532798767, "learning_rate": 0.00010510953561155114, "loss": 1.2676, "step": 230 }, { "epoch": 0.597758405977584, "grad_norm": 1.8930388689041138, "learning_rate": 9.541589315534674e-05, "loss": 1.3132, "step": 240 }, { "epoch": 0.6226650062266501, "grad_norm": 1.8174850940704346, "learning_rate": 8.583038343872554e-05, "loss": 0.9781, "step": 250 }, { "epoch": 0.6226650062266501, "eval_loss": 1.4715014696121216, "eval_runtime": 9.1661, "eval_samples_per_second": 18.437, "eval_steps_per_second": 4.691, "step": 250 }, { "epoch": 0.6475716064757161, "grad_norm": 1.1012831926345825, "learning_rate": 7.642930941173154e-05, "loss": 1.4417, "step": 260 }, { "epoch": 0.6724782067247821, "grad_norm": 1.208770990371704, "learning_rate": 6.728750587220522e-05, "loss": 1.4184, "step": 270 }, { "epoch": 0.6973848069738481, "grad_norm": 1.6374764442443848, "learning_rate": 5.847774376289351e-05, "loss": 1.2979, "step": 280 }, { "epoch": 0.7222914072229141, "grad_norm": 1.8867082595825195, "learning_rate": 5.0070150897339905e-05, "loss": 1.3951, "step": 290 }, { "epoch": 0.7471980074719801, "grad_norm": 1.2463188171386719, "learning_rate": 4.213165372571504e-05, "loss": 0.8543, "step": 300 }, { "epoch": 0.7471980074719801, "eval_loss": 1.4179314374923706, "eval_runtime": 9.0764, "eval_samples_per_second": 18.62, "eval_steps_per_second": 4.738, "step": 300 }, { "epoch": 0.772104607721046, "grad_norm": 1.0654844045639038, "learning_rate": 3.472544458426586e-05, "loss": 1.4909, "step": 310 }, { "epoch": 0.797011207970112, "grad_norm": 1.1939270496368408, "learning_rate": 2.7910478669204397e-05, "loss": 1.3609, "step": 320 }, { "epoch": 0.821917808219178, "grad_norm": 1.3185659646987915, "learning_rate": 2.174100473924473e-05, "loss": 1.4049, "step": 330 }, { "epoch": 0.8468244084682441, "grad_norm": 1.7657341957092285, "learning_rate": 1.626613328250815e-05, "loss": 1.2549, "step": 340 }, { "epoch": 0.8717310087173101, "grad_norm": 1.3518775701522827, "learning_rate": 1.152944558529147e-05, "loss": 1.0009, "step": 350 }, { "epoch": 0.8717310087173101, "eval_loss": 1.3169949054718018, "eval_runtime": 9.1662, "eval_samples_per_second": 18.437, "eval_steps_per_second": 4.691, "step": 350 }, { "epoch": 0.8966376089663761, "grad_norm": 0.9178294539451599, "learning_rate": 7.568646814604021e-06, "loss": 1.3807, "step": 360 }, { "epoch": 0.9215442092154421, "grad_norm": 1.0393375158309937, "learning_rate": 4.4152658760198e-06, "loss": 1.2499, "step": 370 }, { "epoch": 0.9464508094645081, "grad_norm": 1.3953897953033447, "learning_rate": 2.094404436047883e-06, "loss": 1.0791, "step": 380 }, { "epoch": 0.9713574097135741, "grad_norm": 1.0463335514068604, "learning_rate": 6.245371068631892e-07, "loss": 1.1939, "step": 390 }, { "epoch": 0.9962640099626401, "grad_norm": 1.1366596221923828, "learning_rate": 1.736438397464224e-08, "loss": 0.9256, "step": 400 }, { "epoch": 0.9962640099626401, "eval_loss": 1.3189666271209717, "eval_runtime": 9.1794, "eval_samples_per_second": 18.411, "eval_steps_per_second": 4.684, "step": 400 } ], "logging_steps": 10, "max_steps": 402, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.46090111891669e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }