{ "best_metric": 1.8170868158340454, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.28425241614553726, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005685048322910744, "eval_loss": 2.708378791809082, "eval_runtime": 47.2835, "eval_samples_per_second": 15.671, "eval_steps_per_second": 3.934, "step": 1 }, { "epoch": 0.005685048322910745, "grad_norm": 2.072335958480835, "learning_rate": 4.08e-05, "loss": 2.3259, "step": 10 }, { "epoch": 0.01137009664582149, "grad_norm": 2.634218215942383, "learning_rate": 8.16e-05, "loss": 2.141, "step": 20 }, { "epoch": 0.017055144968732235, "grad_norm": 2.5346333980560303, "learning_rate": 0.0001224, "loss": 1.9537, "step": 30 }, { "epoch": 0.02274019329164298, "grad_norm": 5.2234578132629395, "learning_rate": 0.0001632, "loss": 1.9308, "step": 40 }, { "epoch": 0.028425241614553724, "grad_norm": 28.278820037841797, "learning_rate": 0.000204, "loss": 1.9311, "step": 50 }, { "epoch": 0.028425241614553724, "eval_loss": 2.325153350830078, "eval_runtime": 47.0492, "eval_samples_per_second": 15.749, "eval_steps_per_second": 3.953, "step": 50 }, { "epoch": 0.03411028993746447, "grad_norm": 1.899110198020935, "learning_rate": 0.00020375153312650207, "loss": 2.1988, "step": 60 }, { "epoch": 0.039795338260375214, "grad_norm": 2.418156385421753, "learning_rate": 0.00020300734301164017, "loss": 1.939, "step": 70 }, { "epoch": 0.04548038658328596, "grad_norm": 3.028864860534668, "learning_rate": 0.00020177105527484818, "loss": 1.9992, "step": 80 }, { "epoch": 0.051165434906196704, "grad_norm": 4.271291255950928, "learning_rate": 0.00020004869298570854, "loss": 1.8298, "step": 90 }, { "epoch": 0.05685048322910745, "grad_norm": 12.00307846069336, "learning_rate": 0.00019784864732016265, "loss": 2.0342, "step": 100 }, { "epoch": 0.05685048322910745, "eval_loss": 2.5892810821533203, "eval_runtime": 47.1934, "eval_samples_per_second": 15.701, "eval_steps_per_second": 3.941, "step": 100 }, { "epoch": 0.06253553155201819, "grad_norm": 1.9848482608795166, "learning_rate": 0.00019518163667954527, "loss": 2.3178, "step": 110 }, { "epoch": 0.06822057987492894, "grad_norm": 2.8821191787719727, "learning_rate": 0.00019206065447161056, "loss": 2.0378, "step": 120 }, { "epoch": 0.07390562819783968, "grad_norm": 2.8982577323913574, "learning_rate": 0.00018850090580795544, "loss": 1.9132, "step": 130 }, { "epoch": 0.07959067652075043, "grad_norm": 4.434988021850586, "learning_rate": 0.00018451973342624464, "loss": 2.0179, "step": 140 }, { "epoch": 0.08527572484366117, "grad_norm": 11.088447570800781, "learning_rate": 0.00018013653319813575, "loss": 2.1179, "step": 150 }, { "epoch": 0.08527572484366117, "eval_loss": 2.5300779342651367, "eval_runtime": 47.2175, "eval_samples_per_second": 15.693, "eval_steps_per_second": 3.939, "step": 150 }, { "epoch": 0.09096077316657192, "grad_norm": 2.0930769443511963, "learning_rate": 0.0001753726596345424, "loss": 2.3273, "step": 160 }, { "epoch": 0.09664582148948266, "grad_norm": 2.696667194366455, "learning_rate": 0.00017025132184860355, "loss": 1.9949, "step": 170 }, { "epoch": 0.10233086981239341, "grad_norm": 3.0587522983551025, "learning_rate": 0.00016479747048321714, "loss": 1.9261, "step": 180 }, { "epoch": 0.10801591813530415, "grad_norm": 5.556695938110352, "learning_rate": 0.00015903767615401616, "loss": 1.7835, "step": 190 }, { "epoch": 0.1137009664582149, "grad_norm": 11.964973449707031, "learning_rate": 0.000153, "loss": 1.8545, "step": 200 }, { "epoch": 0.1137009664582149, "eval_loss": 2.2151694297790527, "eval_runtime": 47.2341, "eval_samples_per_second": 15.688, "eval_steps_per_second": 3.938, "step": 200 }, { "epoch": 0.11938601478112564, "grad_norm": 1.8511581420898438, "learning_rate": 0.0001467138569724859, "loss": 2.2388, "step": 210 }, { "epoch": 0.12507106310403637, "grad_norm": 2.840690851211548, "learning_rate": 0.00014020987252842305, "loss": 2.0782, "step": 220 }, { "epoch": 0.13075611142694712, "grad_norm": 3.4517576694488525, "learning_rate": 0.00013351973342624464, "loss": 1.8506, "step": 230 }, { "epoch": 0.13644115974985788, "grad_norm": 3.9618589878082275, "learning_rate": 0.00012667603335116609, "loss": 1.7965, "step": 240 }, { "epoch": 0.14212620807276863, "grad_norm": 14.236225128173828, "learning_rate": 0.00011971211412202691, "loss": 2.114, "step": 250 }, { "epoch": 0.14212620807276863, "eval_loss": 2.101684093475342, "eval_runtime": 47.3607, "eval_samples_per_second": 15.646, "eval_steps_per_second": 3.927, "step": 250 }, { "epoch": 0.14781125639567935, "grad_norm": 1.728967308998108, "learning_rate": 0.00011266190325330066, "loss": 2.2158, "step": 260 }, { "epoch": 0.1534963047185901, "grad_norm": 2.7104427814483643, "learning_rate": 0.00010555974866365511, "loss": 1.9438, "step": 270 }, { "epoch": 0.15918135304150086, "grad_norm": 2.974550247192383, "learning_rate": 9.844025133634492e-05, "loss": 1.8889, "step": 280 }, { "epoch": 0.1648664013644116, "grad_norm": 4.237817287445068, "learning_rate": 9.133809674669937e-05, "loss": 1.8694, "step": 290 }, { "epoch": 0.17055144968732233, "grad_norm": 8.692949295043945, "learning_rate": 8.428788587797311e-05, "loss": 1.8763, "step": 300 }, { "epoch": 0.17055144968732233, "eval_loss": 2.04364275932312, "eval_runtime": 47.3289, "eval_samples_per_second": 15.656, "eval_steps_per_second": 3.93, "step": 300 }, { "epoch": 0.17623649801023308, "grad_norm": 2.5650382041931152, "learning_rate": 7.73239666488339e-05, "loss": 2.1507, "step": 310 }, { "epoch": 0.18192154633314384, "grad_norm": 2.478137493133545, "learning_rate": 7.048026657375537e-05, "loss": 1.975, "step": 320 }, { "epoch": 0.1876065946560546, "grad_norm": 5.501384735107422, "learning_rate": 6.379012747157697e-05, "loss": 1.7551, "step": 330 }, { "epoch": 0.1932916429789653, "grad_norm": 4.856377124786377, "learning_rate": 5.7286143027514095e-05, "loss": 1.7361, "step": 340 }, { "epoch": 0.19897669130187606, "grad_norm": 6.624533176422119, "learning_rate": 5.100000000000002e-05, "loss": 1.8018, "step": 350 }, { "epoch": 0.19897669130187606, "eval_loss": 1.9168442487716675, "eval_runtime": 47.4834, "eval_samples_per_second": 15.605, "eval_steps_per_second": 3.917, "step": 350 }, { "epoch": 0.20466173962478681, "grad_norm": 1.6058367490768433, "learning_rate": 4.496232384598384e-05, "loss": 2.2083, "step": 360 }, { "epoch": 0.21034678794769757, "grad_norm": 2.221337080001831, "learning_rate": 3.9202529516782854e-05, "loss": 1.9719, "step": 370 }, { "epoch": 0.2160318362706083, "grad_norm": 3.105900764465332, "learning_rate": 3.374867815139649e-05, "loss": 1.7793, "step": 380 }, { "epoch": 0.22171688459351904, "grad_norm": 3.6795544624328613, "learning_rate": 2.8627340365457602e-05, "loss": 1.72, "step": 390 }, { "epoch": 0.2274019329164298, "grad_norm": 11.990936279296875, "learning_rate": 2.3863466801864254e-05, "loss": 1.8091, "step": 400 }, { "epoch": 0.2274019329164298, "eval_loss": 1.850091576576233, "eval_runtime": 47.2393, "eval_samples_per_second": 15.686, "eval_steps_per_second": 3.937, "step": 400 }, { "epoch": 0.23308698123934055, "grad_norm": 1.6314740180969238, "learning_rate": 1.9480266573755372e-05, "loss": 2.1536, "step": 410 }, { "epoch": 0.23877202956225127, "grad_norm": 2.274401903152466, "learning_rate": 1.5499094192044554e-05, "loss": 2.0182, "step": 420 }, { "epoch": 0.24445707788516202, "grad_norm": 2.7430739402770996, "learning_rate": 1.1939345528389446e-05, "loss": 1.6865, "step": 430 }, { "epoch": 0.25014212620807275, "grad_norm": 3.536646842956543, "learning_rate": 8.818363320454701e-06, "loss": 1.8244, "step": 440 }, { "epoch": 0.2558271745309835, "grad_norm": 26.51901626586914, "learning_rate": 6.1513526798373514e-06, "loss": 2.0217, "step": 450 }, { "epoch": 0.2558271745309835, "eval_loss": 1.8225200176239014, "eval_runtime": 47.5365, "eval_samples_per_second": 15.588, "eval_steps_per_second": 3.913, "step": 450 }, { "epoch": 0.26151222285389425, "grad_norm": 1.6610099077224731, "learning_rate": 3.9513070142914725e-06, "loss": 2.0047, "step": 460 }, { "epoch": 0.26719727117680503, "grad_norm": 2.2376770973205566, "learning_rate": 2.2289447251518195e-06, "loss": 1.951, "step": 470 }, { "epoch": 0.27288231949971575, "grad_norm": 3.238219738006592, "learning_rate": 9.92656988359823e-07, "loss": 1.8643, "step": 480 }, { "epoch": 0.2785673678226265, "grad_norm": 4.071134567260742, "learning_rate": 2.4846687349793185e-07, "loss": 1.7363, "step": 490 }, { "epoch": 0.28425241614553726, "grad_norm": 6.9716572761535645, "learning_rate": 0.0, "loss": 1.707, "step": 500 }, { "epoch": 0.28425241614553726, "eval_loss": 1.8170868158340454, "eval_runtime": 47.4456, "eval_samples_per_second": 15.618, "eval_steps_per_second": 3.92, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.682771533574963e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }