{ "best_metric": 11.72977352142334, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.5830903790087464, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011661807580174927, "eval_loss": 11.762678146362305, "eval_runtime": 1.6672, "eval_samples_per_second": 216.536, "eval_steps_per_second": 54.584, "step": 1 }, { "epoch": 0.011661807580174927, "grad_norm": 0.1031990498304367, "learning_rate": 4.0600000000000004e-05, "loss": 11.7629, "step": 10 }, { "epoch": 0.023323615160349854, "grad_norm": 0.12429989129304886, "learning_rate": 8.120000000000001e-05, "loss": 11.7609, "step": 20 }, { "epoch": 0.03498542274052478, "grad_norm": 0.188057541847229, "learning_rate": 0.00012179999999999999, "loss": 11.7631, "step": 30 }, { "epoch": 0.04664723032069971, "grad_norm": 0.19203342497348785, "learning_rate": 0.00016240000000000002, "loss": 11.7586, "step": 40 }, { "epoch": 0.05830903790087463, "grad_norm": 0.3170674443244934, "learning_rate": 0.000203, "loss": 11.7389, "step": 50 }, { "epoch": 0.05830903790087463, "eval_loss": 11.746705055236816, "eval_runtime": 1.6333, "eval_samples_per_second": 221.02, "eval_steps_per_second": 55.714, "step": 50 }, { "epoch": 0.06997084548104957, "grad_norm": 0.2140980362892151, "learning_rate": 0.00020275275110137215, "loss": 11.7573, "step": 60 }, { "epoch": 0.08163265306122448, "grad_norm": 0.1588020920753479, "learning_rate": 0.00020201220897726938, "loss": 11.7488, "step": 70 }, { "epoch": 0.09329446064139942, "grad_norm": 0.20971915125846863, "learning_rate": 0.00020078198147448128, "loss": 11.7447, "step": 80 }, { "epoch": 0.10495626822157435, "grad_norm": 0.19178971648216248, "learning_rate": 0.00019906806213773937, "loss": 11.7421, "step": 90 }, { "epoch": 0.11661807580174927, "grad_norm": 0.22939780354499817, "learning_rate": 0.0001968788010097697, "loss": 11.7342, "step": 100 }, { "epoch": 0.11661807580174927, "eval_loss": 11.742330551147461, "eval_runtime": 1.6574, "eval_samples_per_second": 217.817, "eval_steps_per_second": 54.907, "step": 100 }, { "epoch": 0.1282798833819242, "grad_norm": 0.19177477061748505, "learning_rate": 0.00019422486395072398, "loss": 11.7501, "step": 110 }, { "epoch": 0.13994169096209913, "grad_norm": 0.15727351605892181, "learning_rate": 0.0001911191806751811, "loss": 11.7447, "step": 120 }, { "epoch": 0.15160349854227406, "grad_norm": 0.1680230051279068, "learning_rate": 0.00018757688175987723, "loss": 11.735, "step": 130 }, { "epoch": 0.16326530612244897, "grad_norm": 0.1868034452199936, "learning_rate": 0.00018361522492905716, "loss": 11.735, "step": 140 }, { "epoch": 0.1749271137026239, "grad_norm": 0.3084157109260559, "learning_rate": 0.00017925351097657625, "loss": 11.7273, "step": 150 }, { "epoch": 0.1749271137026239, "eval_loss": 11.739017486572266, "eval_runtime": 1.6264, "eval_samples_per_second": 221.96, "eval_steps_per_second": 55.951, "step": 150 }, { "epoch": 0.18658892128279883, "grad_norm": 0.24353830516338348, "learning_rate": 0.00017451298973437308, "loss": 11.7474, "step": 160 }, { "epoch": 0.19825072886297376, "grad_norm": 0.2174033671617508, "learning_rate": 0.0001694167565454241, "loss": 11.742, "step": 170 }, { "epoch": 0.2099125364431487, "grad_norm": 0.1835821270942688, "learning_rate": 0.0001639896397455543, "loss": 11.739, "step": 180 }, { "epoch": 0.22157434402332363, "grad_norm": 0.18794667720794678, "learning_rate": 0.0001582580797022808, "loss": 11.7329, "step": 190 }, { "epoch": 0.23323615160349853, "grad_norm": 0.3341284394264221, "learning_rate": 0.00015225, "loss": 11.7255, "step": 200 }, { "epoch": 0.23323615160349853, "eval_loss": 11.735800743103027, "eval_runtime": 1.6348, "eval_samples_per_second": 220.82, "eval_steps_per_second": 55.664, "step": 200 }, { "epoch": 0.24489795918367346, "grad_norm": 0.19055917859077454, "learning_rate": 0.00014599467139909136, "loss": 11.7397, "step": 210 }, { "epoch": 0.2565597667638484, "grad_norm": 0.19719476997852325, "learning_rate": 0.0001395225692317151, "loss": 11.7356, "step": 220 }, { "epoch": 0.26822157434402333, "grad_norm": 0.17899419367313385, "learning_rate": 0.00013286522492905717, "loss": 11.7271, "step": 230 }, { "epoch": 0.27988338192419826, "grad_norm": 0.2608484923839569, "learning_rate": 0.00012605507240336626, "loss": 11.729, "step": 240 }, { "epoch": 0.2915451895043732, "grad_norm": 0.29318031668663025, "learning_rate": 0.00011912529003319345, "loss": 11.7252, "step": 250 }, { "epoch": 0.2915451895043732, "eval_loss": 11.734283447265625, "eval_runtime": 1.6363, "eval_samples_per_second": 220.625, "eval_steps_per_second": 55.615, "step": 250 }, { "epoch": 0.3032069970845481, "grad_norm": 0.1815534234046936, "learning_rate": 0.00011210963902166683, "loss": 11.7412, "step": 260 }, { "epoch": 0.31486880466472306, "grad_norm": 0.2676528990268707, "learning_rate": 0.00010504229891530386, "loss": 11.7368, "step": 270 }, { "epoch": 0.32653061224489793, "grad_norm": 0.2543658912181854, "learning_rate": 9.795770108469618e-05, "loss": 11.7335, "step": 280 }, { "epoch": 0.33819241982507287, "grad_norm": 0.25447171926498413, "learning_rate": 9.08903609783332e-05, "loss": 11.7292, "step": 290 }, { "epoch": 0.3498542274052478, "grad_norm": 0.3657534718513489, "learning_rate": 8.387470996680658e-05, "loss": 11.7254, "step": 300 }, { "epoch": 0.3498542274052478, "eval_loss": 11.732454299926758, "eval_runtime": 1.6359, "eval_samples_per_second": 220.675, "eval_steps_per_second": 55.627, "step": 300 }, { "epoch": 0.36151603498542273, "grad_norm": 0.19558283686637878, "learning_rate": 7.694492759663374e-05, "loss": 11.7372, "step": 310 }, { "epoch": 0.37317784256559766, "grad_norm": 0.29391545057296753, "learning_rate": 7.013477507094284e-05, "loss": 11.7345, "step": 320 }, { "epoch": 0.3848396501457726, "grad_norm": 0.25074857473373413, "learning_rate": 6.347743076828492e-05, "loss": 11.7324, "step": 330 }, { "epoch": 0.3965014577259475, "grad_norm": 0.1810549795627594, "learning_rate": 5.700532860090863e-05, "loss": 11.7274, "step": 340 }, { "epoch": 0.40816326530612246, "grad_norm": 0.3190612196922302, "learning_rate": 5.075000000000002e-05, "loss": 11.7232, "step": 350 }, { "epoch": 0.40816326530612246, "eval_loss": 11.731415748596191, "eval_runtime": 1.6273, "eval_samples_per_second": 221.846, "eval_steps_per_second": 55.922, "step": 350 }, { "epoch": 0.4198250728862974, "grad_norm": 0.2629506587982178, "learning_rate": 4.4741920297719214e-05, "loss": 11.7342, "step": 360 }, { "epoch": 0.4314868804664723, "grad_norm": 0.24675559997558594, "learning_rate": 3.901036025444568e-05, "loss": 11.7363, "step": 370 }, { "epoch": 0.44314868804664725, "grad_norm": 0.23566007614135742, "learning_rate": 3.358324345457592e-05, "loss": 11.73, "step": 380 }, { "epoch": 0.45481049562682213, "grad_norm": 0.24955187737941742, "learning_rate": 2.8487010265626928e-05, "loss": 11.7289, "step": 390 }, { "epoch": 0.46647230320699706, "grad_norm": 0.33412033319473267, "learning_rate": 2.3746489023423744e-05, "loss": 11.7211, "step": 400 }, { "epoch": 0.46647230320699706, "eval_loss": 11.730313301086426, "eval_runtime": 1.6308, "eval_samples_per_second": 221.361, "eval_steps_per_second": 55.8, "step": 400 }, { "epoch": 0.478134110787172, "grad_norm": 0.21241024136543274, "learning_rate": 1.9384775070942844e-05, "loss": 11.7359, "step": 410 }, { "epoch": 0.4897959183673469, "grad_norm": 0.28444772958755493, "learning_rate": 1.5423118240122765e-05, "loss": 11.7339, "step": 420 }, { "epoch": 0.5014577259475219, "grad_norm": 0.2678728401660919, "learning_rate": 1.188081932481891e-05, "loss": 11.7255, "step": 430 }, { "epoch": 0.5131195335276968, "grad_norm": 0.3324980139732361, "learning_rate": 8.775136049276001e-06, "loss": 11.7318, "step": 440 }, { "epoch": 0.5247813411078717, "grad_norm": 0.48563843965530396, "learning_rate": 6.121198990230306e-06, "loss": 11.7233, "step": 450 }, { "epoch": 0.5247813411078717, "eval_loss": 11.73011302947998, "eval_runtime": 1.6604, "eval_samples_per_second": 217.421, "eval_steps_per_second": 54.807, "step": 450 }, { "epoch": 0.5364431486880467, "grad_norm": 0.25317201018333435, "learning_rate": 3.931937862260632e-06, "loss": 11.7334, "step": 460 }, { "epoch": 0.5481049562682215, "grad_norm": 0.28131625056266785, "learning_rate": 2.2180185255187225e-06, "loss": 11.7322, "step": 470 }, { "epoch": 0.5597667638483965, "grad_norm": 0.21770986914634705, "learning_rate": 9.877910227306082e-07, "loss": 11.7306, "step": 480 }, { "epoch": 0.5714285714285714, "grad_norm": 0.31401193141937256, "learning_rate": 2.472488986278439e-07, "loss": 11.7282, "step": 490 }, { "epoch": 0.5830903790087464, "grad_norm": 0.3276008665561676, "learning_rate": 0.0, "loss": 11.7215, "step": 500 }, { "epoch": 0.5830903790087464, "eval_loss": 11.72977352142334, "eval_runtime": 1.6296, "eval_samples_per_second": 221.53, "eval_steps_per_second": 55.843, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 25896615936000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }