{ "best_metric": 1.139582872390747, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 0.00865263977618505, "eval_steps": 50, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.768426517456701e-05, "eval_loss": 2.3238277435302734, "eval_runtime": 495.8901, "eval_samples_per_second": 58.878, "eval_steps_per_second": 14.721, "step": 1 }, { "epoch": 0.00017305279552370101, "grad_norm": 0.45083099603652954, "learning_rate": 3e-05, "loss": 1.4648, "step": 3 }, { "epoch": 0.00034610559104740203, "grad_norm": 0.5587688684463501, "learning_rate": 6e-05, "loss": 1.8717, "step": 6 }, { "epoch": 0.0005191583865711031, "grad_norm": 0.5804035067558289, "learning_rate": 9e-05, "loss": 1.8528, "step": 9 }, { "epoch": 0.0006922111820948041, "grad_norm": 0.5394102931022644, "learning_rate": 9.997266286704631e-05, "loss": 1.8216, "step": 12 }, { "epoch": 0.0008652639776185052, "grad_norm": 0.5287687182426453, "learning_rate": 9.98292246503335e-05, "loss": 1.4867, "step": 15 }, { "epoch": 0.0010383167731422061, "grad_norm": 0.5580219626426697, "learning_rate": 9.956320346634876e-05, "loss": 1.553, "step": 18 }, { "epoch": 0.0012113695686659072, "grad_norm": 0.5071175694465637, "learning_rate": 9.917525374361912e-05, "loss": 1.4905, "step": 21 }, { "epoch": 0.0013844223641896081, "grad_norm": 0.618292510509491, "learning_rate": 9.86663298624003e-05, "loss": 1.3951, "step": 24 }, { "epoch": 0.0015574751597133092, "grad_norm": 0.46973147988319397, "learning_rate": 9.803768380684242e-05, "loss": 1.5449, "step": 27 }, { "epoch": 0.0017305279552370103, "grad_norm": 0.49171945452690125, "learning_rate": 9.729086208503174e-05, "loss": 1.3479, "step": 30 }, { "epoch": 0.0019035807507607112, "grad_norm": 0.48666226863861084, "learning_rate": 9.642770192448536e-05, "loss": 1.4163, "step": 33 }, { "epoch": 0.0020766335462844123, "grad_norm": 0.5254663825035095, "learning_rate": 9.545032675245813e-05, "loss": 1.3606, "step": 36 }, { "epoch": 0.0022496863418081134, "grad_norm": 0.4926294684410095, "learning_rate": 9.43611409721806e-05, "loss": 1.3109, "step": 39 }, { "epoch": 0.0024227391373318145, "grad_norm": 0.5055580735206604, "learning_rate": 9.316282404787871e-05, "loss": 1.3493, "step": 42 }, { "epoch": 0.002595791932855515, "grad_norm": 0.5364347100257874, "learning_rate": 9.185832391312644e-05, "loss": 1.4115, "step": 45 }, { "epoch": 0.0027688447283792162, "grad_norm": 0.7558141350746155, "learning_rate": 9.045084971874738e-05, "loss": 1.4071, "step": 48 }, { "epoch": 0.0028842132587283504, "eval_loss": 1.2356951236724854, "eval_runtime": 500.8722, "eval_samples_per_second": 58.292, "eval_steps_per_second": 14.575, "step": 50 }, { "epoch": 0.0029418975239029173, "grad_norm": 0.4775656461715698, "learning_rate": 8.894386393810563e-05, "loss": 1.2319, "step": 51 }, { "epoch": 0.0031149503194266184, "grad_norm": 0.5036152601242065, "learning_rate": 8.73410738492077e-05, "loss": 0.9718, "step": 54 }, { "epoch": 0.0032880031149503195, "grad_norm": 0.39522024989128113, "learning_rate": 8.564642241456986e-05, "loss": 1.1598, "step": 57 }, { "epoch": 0.0034610559104740206, "grad_norm": 0.35405585169792175, "learning_rate": 8.386407858128706e-05, "loss": 1.0674, "step": 60 }, { "epoch": 0.0036341087059977213, "grad_norm": 0.4121133089065552, "learning_rate": 8.199842702516583e-05, "loss": 1.0174, "step": 63 }, { "epoch": 0.0038071615015214224, "grad_norm": 0.32193905115127563, "learning_rate": 8.005405736415126e-05, "loss": 1.0159, "step": 66 }, { "epoch": 0.003980214297045124, "grad_norm": 0.3453183174133301, "learning_rate": 7.803575286758364e-05, "loss": 1.0557, "step": 69 }, { "epoch": 0.0041532670925688246, "grad_norm": 0.3785501718521118, "learning_rate": 7.594847868906076e-05, "loss": 1.2081, "step": 72 }, { "epoch": 0.004326319888092525, "grad_norm": 0.3871416747570038, "learning_rate": 7.379736965185368e-05, "loss": 1.1649, "step": 75 }, { "epoch": 0.004499372683616227, "grad_norm": 0.4219188988208771, "learning_rate": 7.158771761692464e-05, "loss": 1.1471, "step": 78 }, { "epoch": 0.004672425479139927, "grad_norm": 0.40537410974502563, "learning_rate": 6.932495846462261e-05, "loss": 1.3124, "step": 81 }, { "epoch": 0.004845478274663629, "grad_norm": 0.36390063166618347, "learning_rate": 6.701465872208216e-05, "loss": 1.2589, "step": 84 }, { "epoch": 0.00501853107018733, "grad_norm": 0.4167007803916931, "learning_rate": 6.466250186922325e-05, "loss": 1.0981, "step": 87 }, { "epoch": 0.00519158386571103, "grad_norm": 0.4835963845252991, "learning_rate": 6.227427435703997e-05, "loss": 1.3458, "step": 90 }, { "epoch": 0.005364636661234732, "grad_norm": 0.46142578125, "learning_rate": 5.985585137257401e-05, "loss": 1.1397, "step": 93 }, { "epoch": 0.0055376894567584324, "grad_norm": 0.5481660962104797, "learning_rate": 5.74131823855921e-05, "loss": 1.1947, "step": 96 }, { "epoch": 0.005710742252282134, "grad_norm": 0.6424285173416138, "learning_rate": 5.495227651252315e-05, "loss": 1.4071, "step": 99 }, { "epoch": 0.005768426517456701, "eval_loss": 1.1644848585128784, "eval_runtime": 495.3076, "eval_samples_per_second": 58.947, "eval_steps_per_second": 14.738, "step": 100 }, { "epoch": 0.005883795047805835, "grad_norm": 0.46381473541259766, "learning_rate": 5.247918773366112e-05, "loss": 1.1083, "step": 102 }, { "epoch": 0.006056847843329536, "grad_norm": 0.5000247955322266, "learning_rate": 5e-05, "loss": 1.0165, "step": 105 }, { "epoch": 0.006229900638853237, "grad_norm": 0.38582298159599304, "learning_rate": 4.7520812266338885e-05, "loss": 1.059, "step": 108 }, { "epoch": 0.0064029534343769375, "grad_norm": 0.40634816884994507, "learning_rate": 4.504772348747687e-05, "loss": 1.021, "step": 111 }, { "epoch": 0.006576006229900639, "grad_norm": 0.32036200165748596, "learning_rate": 4.2586817614407895e-05, "loss": 0.9665, "step": 114 }, { "epoch": 0.00674905902542434, "grad_norm": 0.3690759241580963, "learning_rate": 4.0144148627425993e-05, "loss": 1.1389, "step": 117 }, { "epoch": 0.006922111820948041, "grad_norm": 0.3883983790874481, "learning_rate": 3.772572564296005e-05, "loss": 1.1566, "step": 120 }, { "epoch": 0.007095164616471742, "grad_norm": 0.4271601438522339, "learning_rate": 3.533749813077677e-05, "loss": 1.1872, "step": 123 }, { "epoch": 0.0072682174119954425, "grad_norm": 0.358237087726593, "learning_rate": 3.298534127791785e-05, "loss": 1.1692, "step": 126 }, { "epoch": 0.007441270207519144, "grad_norm": 0.365348219871521, "learning_rate": 3.0675041535377405e-05, "loss": 1.1708, "step": 129 }, { "epoch": 0.007614323003042845, "grad_norm": 0.41103696823120117, "learning_rate": 2.8412282383075363e-05, "loss": 1.2572, "step": 132 }, { "epoch": 0.007787375798566546, "grad_norm": 0.41762009263038635, "learning_rate": 2.6202630348146324e-05, "loss": 1.1534, "step": 135 }, { "epoch": 0.007960428594090248, "grad_norm": 0.39923688769340515, "learning_rate": 2.405152131093926e-05, "loss": 1.3375, "step": 138 }, { "epoch": 0.008133481389613948, "grad_norm": 0.47532254457473755, "learning_rate": 2.196424713241637e-05, "loss": 1.3354, "step": 141 }, { "epoch": 0.008306534185137649, "grad_norm": 0.3896162211894989, "learning_rate": 1.9945942635848748e-05, "loss": 1.3604, "step": 144 }, { "epoch": 0.00847958698066135, "grad_norm": 0.4989997446537018, "learning_rate": 1.800157297483417e-05, "loss": 1.3699, "step": 147 }, { "epoch": 0.00865263977618505, "grad_norm": 0.7773590087890625, "learning_rate": 1.6135921418712956e-05, "loss": 1.333, "step": 150 }, { "epoch": 0.00865263977618505, "eval_loss": 1.139582872390747, "eval_runtime": 496.6517, "eval_samples_per_second": 58.788, "eval_steps_per_second": 14.698, "step": 150 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0303319310336e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }