{ "best_metric": 1.227325201034546, "best_model_checkpoint": "miner_id_24/checkpoint-350", "epoch": 1.0012453300124533, "eval_steps": 50, "global_step": 402, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024906600249066002, "eval_loss": 2.2299909591674805, "eval_runtime": 18.2176, "eval_samples_per_second": 9.277, "eval_steps_per_second": 2.36, "step": 1 }, { "epoch": 0.024906600249066, "grad_norm": 7.265342712402344, "learning_rate": 4.0600000000000004e-05, "loss": 2.903, "step": 10 }, { "epoch": 0.049813200498132, "grad_norm": 8.326602935791016, "learning_rate": 8.120000000000001e-05, "loss": 3.235, "step": 20 }, { "epoch": 0.074719800747198, "grad_norm": 7.7111029624938965, "learning_rate": 0.00012179999999999999, "loss": 2.9605, "step": 30 }, { "epoch": 0.099626400996264, "grad_norm": 9.1806640625, "learning_rate": 0.00016240000000000002, "loss": 2.9713, "step": 40 }, { "epoch": 0.12453300124533001, "grad_norm": 8.711584091186523, "learning_rate": 0.000203, "loss": 2.537, "step": 50 }, { "epoch": 0.12453300124533001, "eval_loss": 1.6622552871704102, "eval_runtime": 18.3781, "eval_samples_per_second": 9.196, "eval_steps_per_second": 2.34, "step": 50 }, { "epoch": 0.149439601494396, "grad_norm": 5.675422191619873, "learning_rate": 0.00020259601786141667, "loss": 2.9775, "step": 60 }, { "epoch": 0.17434620174346202, "grad_norm": 7.719958305358887, "learning_rate": 0.00020138728724011578, "loss": 2.9284, "step": 70 }, { "epoch": 0.199252801992528, "grad_norm": 6.898895263671875, "learning_rate": 0.00019938342992095184, "loss": 2.81, "step": 80 }, { "epoch": 0.22415940224159403, "grad_norm": 8.30861759185791, "learning_rate": 0.00019660039708747584, "loss": 2.5408, "step": 90 }, { "epoch": 0.24906600249066002, "grad_norm": 10.431880950927734, "learning_rate": 0.0001930603423466988, "loss": 2.2706, "step": 100 }, { "epoch": 0.24906600249066002, "eval_loss": 1.626971960067749, "eval_runtime": 18.3856, "eval_samples_per_second": 9.192, "eval_steps_per_second": 2.339, "step": 100 }, { "epoch": 0.273972602739726, "grad_norm": 5.046250343322754, "learning_rate": 0.0001887914453810816, "loss": 3.2328, "step": 110 }, { "epoch": 0.298879202988792, "grad_norm": 7.967824459075928, "learning_rate": 0.0001838276876315219, "loss": 2.9374, "step": 120 }, { "epoch": 0.32378580323785805, "grad_norm": 7.071413993835449, "learning_rate": 0.00017820858179695722, "loss": 2.5715, "step": 130 }, { "epoch": 0.34869240348692404, "grad_norm": 7.455673694610596, "learning_rate": 0.00017197885730383037, "loss": 2.5618, "step": 140 }, { "epoch": 0.37359900373599003, "grad_norm": 6.7585673332214355, "learning_rate": 0.00016518810424915649, "loss": 2.2597, "step": 150 }, { "epoch": 0.37359900373599003, "eval_loss": 1.6305289268493652, "eval_runtime": 18.1899, "eval_samples_per_second": 9.291, "eval_steps_per_second": 2.364, "step": 150 }, { "epoch": 0.398505603985056, "grad_norm": 5.758382320404053, "learning_rate": 0.00015789037865148963, "loss": 2.7558, "step": 160 }, { "epoch": 0.42341220423412207, "grad_norm": 5.175365447998047, "learning_rate": 0.00015014377215208578, "loss": 2.8974, "step": 170 }, { "epoch": 0.44831880448318806, "grad_norm": 7.262718677520752, "learning_rate": 0.00014200994959154376, "loss": 2.4825, "step": 180 }, { "epoch": 0.47322540473225405, "grad_norm": 7.489962577819824, "learning_rate": 0.00013355365814292505, "loss": 2.6472, "step": 190 }, { "epoch": 0.49813200498132004, "grad_norm": 6.218911647796631, "learning_rate": 0.00012484221190876938, "loss": 2.0886, "step": 200 }, { "epoch": 0.49813200498132004, "eval_loss": 1.5863020420074463, "eval_runtime": 18.3865, "eval_samples_per_second": 9.192, "eval_steps_per_second": 2.339, "step": 200 }, { "epoch": 0.523038605230386, "grad_norm": 4.3309550285339355, "learning_rate": 0.00011594495608473845, "loss": 3.1857, "step": 210 }, { "epoch": 0.547945205479452, "grad_norm": 6.128057479858398, "learning_rate": 0.00010693271495527193, "loss": 2.526, "step": 220 }, { "epoch": 0.572851805728518, "grad_norm": 7.040269374847412, "learning_rate": 9.78772281153435e-05, "loss": 2.3066, "step": 230 }, { "epoch": 0.597758405977584, "grad_norm": 7.6243133544921875, "learning_rate": 8.885057940612564e-05, "loss": 2.5023, "step": 240 }, { "epoch": 0.6226650062266501, "grad_norm": 8.762333869934082, "learning_rate": 7.992462311037286e-05, "loss": 2.1512, "step": 250 }, { "epoch": 0.6226650062266501, "eval_loss": 1.4163031578063965, "eval_runtime": 18.3612, "eval_samples_per_second": 9.204, "eval_steps_per_second": 2.342, "step": 250 }, { "epoch": 0.6475716064757161, "grad_norm": 4.309693336486816, "learning_rate": 7.117041197514451e-05, "loss": 2.8092, "step": 260 }, { "epoch": 0.6724782067247821, "grad_norm": 5.296663761138916, "learning_rate": 6.265763161494339e-05, "loss": 2.6797, "step": 270 }, { "epoch": 0.6973848069738481, "grad_norm": 6.377048015594482, "learning_rate": 5.44540457975568e-05, "loss": 2.2562, "step": 280 }, { "epoch": 0.7222914072229141, "grad_norm": 7.426491737365723, "learning_rate": 4.662495702825688e-05, "loss": 2.527, "step": 290 }, { "epoch": 0.7471980074719801, "grad_norm": 4.965656757354736, "learning_rate": 3.9232686726239235e-05, "loss": 1.9925, "step": 300 }, { "epoch": 0.7471980074719801, "eval_loss": 1.3612326383590698, "eval_runtime": 18.2162, "eval_samples_per_second": 9.277, "eval_steps_per_second": 2.361, "step": 300 }, { "epoch": 0.772104607721046, "grad_norm": 3.807908296585083, "learning_rate": 3.2336079131220047e-05, "loss": 2.7977, "step": 310 }, { "epoch": 0.797011207970112, "grad_norm": 4.878301620483398, "learning_rate": 2.5990032889213268e-05, "loss": 2.6721, "step": 320 }, { "epoch": 0.821917808219178, "grad_norm": 5.5696516036987305, "learning_rate": 2.024506404617743e-05, "loss": 2.4447, "step": 330 }, { "epoch": 0.8468244084682441, "grad_norm": 6.529803276062012, "learning_rate": 1.5146903928207132e-05, "loss": 2.2873, "step": 340 }, { "epoch": 0.8717310087173101, "grad_norm": 5.140824794769287, "learning_rate": 1.0736135109239304e-05, "loss": 2.1168, "step": 350 }, { "epoch": 0.8717310087173101, "eval_loss": 1.227325201034546, "eval_runtime": 18.3672, "eval_samples_per_second": 9.201, "eval_steps_per_second": 2.341, "step": 350 }, { "epoch": 0.8966376089663761, "grad_norm": 4.607552528381348, "learning_rate": 7.047868364057873e-06, "loss": 2.6504, "step": 360 }, { "epoch": 0.9215442092154421, "grad_norm": 3.7642176151275635, "learning_rate": 4.111463178128528e-06, "loss": 2.2523, "step": 370 }, { "epoch": 0.9464508094645081, "grad_norm": 4.930432319641113, "learning_rate": 1.950294039072111e-06, "loss": 1.9105, "step": 380 }, { "epoch": 0.9713574097135741, "grad_norm": 5.943967342376709, "learning_rate": 5.815643701524193e-07, "loss": 2.2674, "step": 390 }, { "epoch": 0.9962640099626401, "grad_norm": 4.921615123748779, "learning_rate": 1.6169586912166855e-08, "loss": 1.8918, "step": 400 }, { "epoch": 0.9962640099626401, "eval_loss": 1.2326724529266357, "eval_runtime": 18.3821, "eval_samples_per_second": 9.194, "eval_steps_per_second": 2.339, "step": 400 } ], "logging_steps": 10, "max_steps": 402, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3929871243739136e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }