{ "best_metric": 1.2338447570800781, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.6901311249137336, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013802622498274672, "eval_loss": 3.10593581199646, "eval_runtime": 18.6566, "eval_samples_per_second": 16.348, "eval_steps_per_second": 4.127, "step": 1 }, { "epoch": 0.013802622498274672, "grad_norm": 24.552059173583984, "learning_rate": 4.1400000000000003e-05, "loss": 5.1127, "step": 10 }, { "epoch": 0.027605244996549344, "grad_norm": 15.504034042358398, "learning_rate": 8.280000000000001e-05, "loss": 3.841, "step": 20 }, { "epoch": 0.041407867494824016, "grad_norm": 14.46058177947998, "learning_rate": 0.00012419999999999998, "loss": 3.534, "step": 30 }, { "epoch": 0.05521048999309869, "grad_norm": 51.250701904296875, "learning_rate": 0.00016560000000000001, "loss": 3.9728, "step": 40 }, { "epoch": 0.06901311249137336, "grad_norm": 27.90164566040039, "learning_rate": 0.000207, "loss": 3.3681, "step": 50 }, { "epoch": 0.06901311249137336, "eval_loss": 2.3035168647766113, "eval_runtime": 18.6377, "eval_samples_per_second": 16.365, "eval_steps_per_second": 4.131, "step": 50 }, { "epoch": 0.08281573498964803, "grad_norm": 16.63316535949707, "learning_rate": 0.00020674787920189178, "loss": 4.045, "step": 60 }, { "epoch": 0.0966183574879227, "grad_norm": 14.355842590332031, "learning_rate": 0.00020599274511475253, "loss": 3.2498, "step": 70 }, { "epoch": 0.11042097998619738, "grad_norm": 14.458366394042969, "learning_rate": 0.00020473827667594888, "loss": 3.7108, "step": 80 }, { "epoch": 0.12422360248447205, "grad_norm": 16.983057022094727, "learning_rate": 0.00020299058552961598, "loss": 3.4946, "step": 90 }, { "epoch": 0.13802622498274672, "grad_norm": 30.41816520690918, "learning_rate": 0.00020075818625134152, "loss": 3.7154, "step": 100 }, { "epoch": 0.13802622498274672, "eval_loss": 2.316256523132324, "eval_runtime": 18.6116, "eval_samples_per_second": 16.388, "eval_steps_per_second": 4.137, "step": 100 }, { "epoch": 0.1518288474810214, "grad_norm": 69.45185852050781, "learning_rate": 0.00019805195486600916, "loss": 3.5852, "step": 110 }, { "epoch": 0.16563146997929606, "grad_norm": 14.340858459472656, "learning_rate": 0.00019488507586089894, "loss": 3.4744, "step": 120 }, { "epoch": 0.17943409247757075, "grad_norm": 28.43404197692871, "learning_rate": 0.00019127297795219008, "loss": 3.1552, "step": 130 }, { "epoch": 0.1932367149758454, "grad_norm": 18.572847366333008, "learning_rate": 0.00018723325891780706, "loss": 3.3675, "step": 140 }, { "epoch": 0.2070393374741201, "grad_norm": 26.13022804260254, "learning_rate": 0.0001827855998628142, "loss": 3.4886, "step": 150 }, { "epoch": 0.2070393374741201, "eval_loss": 1.9903616905212402, "eval_runtime": 18.6032, "eval_samples_per_second": 16.395, "eval_steps_per_second": 4.139, "step": 150 }, { "epoch": 0.22084195997239475, "grad_norm": 17.420490264892578, "learning_rate": 0.0001779516693350504, "loss": 3.5297, "step": 160 }, { "epoch": 0.23464458247066944, "grad_norm": 21.1903133392334, "learning_rate": 0.00017275501775814182, "loss": 3.1483, "step": 170 }, { "epoch": 0.2484472049689441, "grad_norm": 16.870092391967773, "learning_rate": 0.00016722096269620562, "loss": 2.9207, "step": 180 }, { "epoch": 0.26224982746721875, "grad_norm": 19.400983810424805, "learning_rate": 0.00016137646550922228, "loss": 4.4511, "step": 190 }, { "epoch": 0.27605244996549344, "grad_norm": 23.654308319091797, "learning_rate": 0.00015525, "loss": 2.9133, "step": 200 }, { "epoch": 0.27605244996549344, "eval_loss": 1.8559632301330566, "eval_runtime": 18.5949, "eval_samples_per_second": 16.402, "eval_steps_per_second": 4.141, "step": 200 }, { "epoch": 0.2898550724637681, "grad_norm": 27.395263671875, "learning_rate": 0.0001488714136926695, "loss": 3.4052, "step": 210 }, { "epoch": 0.3036576949620428, "grad_norm": 15.65333080291748, "learning_rate": 0.0001422717824185469, "loss": 3.1855, "step": 220 }, { "epoch": 0.31746031746031744, "grad_norm": 13.883142471313477, "learning_rate": 0.00013548325891780705, "loss": 3.1015, "step": 230 }, { "epoch": 0.33126293995859213, "grad_norm": 20.504344940185547, "learning_rate": 0.0001285389161945656, "loss": 3.6529, "step": 240 }, { "epoch": 0.3450655624568668, "grad_norm": 37.432334899902344, "learning_rate": 0.0001214725863885273, "loss": 4.7279, "step": 250 }, { "epoch": 0.3450655624568668, "eval_loss": 1.8165959119796753, "eval_runtime": 18.6944, "eval_samples_per_second": 16.315, "eval_steps_per_second": 4.119, "step": 250 }, { "epoch": 0.3588681849551415, "grad_norm": 16.661788940429688, "learning_rate": 0.00011431869594820213, "loss": 3.6444, "step": 260 }, { "epoch": 0.37267080745341613, "grad_norm": 21.61427116394043, "learning_rate": 0.00010711209790870886, "loss": 2.8124, "step": 270 }, { "epoch": 0.3864734299516908, "grad_norm": 14.587434768676758, "learning_rate": 9.988790209129117e-05, "loss": 2.6168, "step": 280 }, { "epoch": 0.4002760524499655, "grad_norm": 10.792880058288574, "learning_rate": 9.268130405179787e-05, "loss": 2.8998, "step": 290 }, { "epoch": 0.4140786749482402, "grad_norm": 19.90591049194336, "learning_rate": 8.55274136114727e-05, "loss": 2.9482, "step": 300 }, { "epoch": 0.4140786749482402, "eval_loss": 1.5225303173065186, "eval_runtime": 18.6815, "eval_samples_per_second": 16.326, "eval_steps_per_second": 4.122, "step": 300 }, { "epoch": 0.4278812974465148, "grad_norm": 12.331941604614258, "learning_rate": 7.84610838054344e-05, "loss": 2.8716, "step": 310 }, { "epoch": 0.4416839199447895, "grad_norm": 12.893694877624512, "learning_rate": 7.151674108219295e-05, "loss": 2.9602, "step": 320 }, { "epoch": 0.4554865424430642, "grad_norm": 10.044930458068848, "learning_rate": 6.472821758145309e-05, "loss": 2.7228, "step": 330 }, { "epoch": 0.4692891649413389, "grad_norm": 10.728809356689453, "learning_rate": 5.8128586307330475e-05, "loss": 2.9921, "step": 340 }, { "epoch": 0.4830917874396135, "grad_norm": 20.39414405822754, "learning_rate": 5.175000000000002e-05, "loss": 2.815, "step": 350 }, { "epoch": 0.4830917874396135, "eval_loss": 1.4471914768218994, "eval_runtime": 18.6607, "eval_samples_per_second": 16.344, "eval_steps_per_second": 4.126, "step": 350 }, { "epoch": 0.4968944099378882, "grad_norm": 11.737767219543457, "learning_rate": 4.5623534490777714e-05, "loss": 2.9584, "step": 360 }, { "epoch": 0.5106970324361628, "grad_norm": 16.33255958557129, "learning_rate": 3.9779037303794365e-05, "loss": 2.6765, "step": 370 }, { "epoch": 0.5244996549344375, "grad_norm": 10.07257080078125, "learning_rate": 3.42449822418582e-05, "loss": 2.715, "step": 380 }, { "epoch": 0.5383022774327122, "grad_norm": 20.379079818725586, "learning_rate": 2.9048330664949622e-05, "loss": 2.5815, "step": 390 }, { "epoch": 0.5521048999309869, "grad_norm": 24.735910415649414, "learning_rate": 2.4214400137185785e-05, "loss": 2.5287, "step": 400 }, { "epoch": 0.5521048999309869, "eval_loss": 1.2898170948028564, "eval_runtime": 18.6447, "eval_samples_per_second": 16.359, "eval_steps_per_second": 4.13, "step": 400 }, { "epoch": 0.5659075224292616, "grad_norm": 10.461315155029297, "learning_rate": 1.976674108219295e-05, "loss": 2.38, "step": 410 }, { "epoch": 0.5797101449275363, "grad_norm": 12.781827926635742, "learning_rate": 1.572702204780991e-05, "loss": 2.5618, "step": 420 }, { "epoch": 0.5935127674258109, "grad_norm": 12.61215877532959, "learning_rate": 1.2114924139101056e-05, "loss": 2.1771, "step": 430 }, { "epoch": 0.6073153899240856, "grad_norm": 12.710418701171875, "learning_rate": 8.948045133990798e-06, "loss": 2.2393, "step": 440 }, { "epoch": 0.6211180124223602, "grad_norm": 20.297300338745117, "learning_rate": 6.241813748658489e-06, "loss": 2.4048, "step": 450 }, { "epoch": 0.6211180124223602, "eval_loss": 1.241526484489441, "eval_runtime": 18.7361, "eval_samples_per_second": 16.279, "eval_steps_per_second": 4.11, "step": 450 }, { "epoch": 0.6349206349206349, "grad_norm": 13.001998901367188, "learning_rate": 4.009414470383994e-06, "loss": 2.4387, "step": 460 }, { "epoch": 0.6487232574189096, "grad_norm": 10.605116844177246, "learning_rate": 2.261723324051111e-06, "loss": 2.2201, "step": 470 }, { "epoch": 0.6625258799171843, "grad_norm": 8.887131690979004, "learning_rate": 1.0072548852474675e-06, "loss": 1.831, "step": 480 }, { "epoch": 0.6763285024154589, "grad_norm": 15.026272773742676, "learning_rate": 2.5212079810819554e-07, "loss": 2.6445, "step": 490 }, { "epoch": 0.6901311249137336, "grad_norm": 23.53835678100586, "learning_rate": 0.0, "loss": 2.4292, "step": 500 }, { "epoch": 0.6901311249137336, "eval_loss": 1.2338447570800781, "eval_runtime": 18.701, "eval_samples_per_second": 16.309, "eval_steps_per_second": 4.117, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.9437425893376e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }