{ "best_metric": 1.5858772993087769, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.01620929440941436, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.241858881882872e-05, "eval_loss": 2.906552791595459, "eval_runtime": 397.3517, "eval_samples_per_second": 32.686, "eval_steps_per_second": 8.172, "step": 1 }, { "epoch": 0.0003241858881882872, "grad_norm": 6.263719081878662, "learning_rate": 4.1400000000000003e-05, "loss": 2.7332, "step": 10 }, { "epoch": 0.0006483717763765744, "grad_norm": 5.530974388122559, "learning_rate": 8.280000000000001e-05, "loss": 1.9108, "step": 20 }, { "epoch": 0.0009725576645648615, "grad_norm": 3.9441070556640625, "learning_rate": 0.00012419999999999998, "loss": 1.8745, "step": 30 }, { "epoch": 0.0012967435527531487, "grad_norm": 4.260671138763428, "learning_rate": 0.00016560000000000001, "loss": 1.7333, "step": 40 }, { "epoch": 0.0016209294409414358, "grad_norm": 4.642955303192139, "learning_rate": 0.000207, "loss": 1.6797, "step": 50 }, { "epoch": 0.0016209294409414358, "eval_loss": 1.8519707918167114, "eval_runtime": 398.2732, "eval_samples_per_second": 32.611, "eval_steps_per_second": 8.153, "step": 50 }, { "epoch": 0.001945115329129723, "grad_norm": 4.400924205780029, "learning_rate": 0.00020674787920189178, "loss": 1.868, "step": 60 }, { "epoch": 0.00226930121731801, "grad_norm": 5.454312324523926, "learning_rate": 0.00020599274511475253, "loss": 2.0069, "step": 70 }, { "epoch": 0.0025934871055062975, "grad_norm": 4.467811107635498, "learning_rate": 0.00020473827667594888, "loss": 1.7519, "step": 80 }, { "epoch": 0.0029176729936945846, "grad_norm": 4.802621841430664, "learning_rate": 0.00020299058552961598, "loss": 1.6975, "step": 90 }, { "epoch": 0.0032418588818828717, "grad_norm": 4.396114826202393, "learning_rate": 0.00020075818625134152, "loss": 1.6838, "step": 100 }, { "epoch": 0.0032418588818828717, "eval_loss": 1.8345727920532227, "eval_runtime": 398.358, "eval_samples_per_second": 32.604, "eval_steps_per_second": 8.151, "step": 100 }, { "epoch": 0.0035660447700711587, "grad_norm": 3.908578872680664, "learning_rate": 0.00019805195486600916, "loss": 2.0062, "step": 110 }, { "epoch": 0.003890230658259446, "grad_norm": 3.713029623031616, "learning_rate": 0.00019488507586089894, "loss": 1.7925, "step": 120 }, { "epoch": 0.004214416546447733, "grad_norm": 4.107682704925537, "learning_rate": 0.00019127297795219008, "loss": 1.8635, "step": 130 }, { "epoch": 0.00453860243463602, "grad_norm": 5.115502834320068, "learning_rate": 0.00018723325891780706, "loss": 1.8397, "step": 140 }, { "epoch": 0.004862788322824307, "grad_norm": 4.171757221221924, "learning_rate": 0.0001827855998628142, "loss": 1.8328, "step": 150 }, { "epoch": 0.004862788322824307, "eval_loss": 1.7966933250427246, "eval_runtime": 397.1601, "eval_samples_per_second": 32.702, "eval_steps_per_second": 8.176, "step": 150 }, { "epoch": 0.005186974211012595, "grad_norm": 3.389646053314209, "learning_rate": 0.0001779516693350504, "loss": 1.8123, "step": 160 }, { "epoch": 0.005511160099200882, "grad_norm": 4.33231258392334, "learning_rate": 0.00017275501775814182, "loss": 1.8088, "step": 170 }, { "epoch": 0.005835345987389169, "grad_norm": 4.176562309265137, "learning_rate": 0.00016722096269620562, "loss": 1.7369, "step": 180 }, { "epoch": 0.006159531875577456, "grad_norm": 3.4682700634002686, "learning_rate": 0.00016137646550922228, "loss": 1.6268, "step": 190 }, { "epoch": 0.006483717763765743, "grad_norm": 4.2149200439453125, "learning_rate": 0.00015525, "loss": 1.6822, "step": 200 }, { "epoch": 0.006483717763765743, "eval_loss": 1.7772059440612793, "eval_runtime": 397.9979, "eval_samples_per_second": 32.633, "eval_steps_per_second": 8.158, "step": 200 }, { "epoch": 0.00680790365195403, "grad_norm": 2.8696839809417725, "learning_rate": 0.0001488714136926695, "loss": 1.8906, "step": 210 }, { "epoch": 0.0071320895401423175, "grad_norm": 3.76540207862854, "learning_rate": 0.0001422717824185469, "loss": 1.7974, "step": 220 }, { "epoch": 0.0074562754283306045, "grad_norm": 4.557528972625732, "learning_rate": 0.00013548325891780705, "loss": 1.8021, "step": 230 }, { "epoch": 0.007780461316518892, "grad_norm": 4.325812339782715, "learning_rate": 0.0001285389161945656, "loss": 1.7236, "step": 240 }, { "epoch": 0.00810464720470718, "grad_norm": 3.879993200302124, "learning_rate": 0.0001214725863885273, "loss": 1.7255, "step": 250 }, { "epoch": 0.00810464720470718, "eval_loss": 1.7434029579162598, "eval_runtime": 397.2014, "eval_samples_per_second": 32.699, "eval_steps_per_second": 8.175, "step": 250 }, { "epoch": 0.008428833092895466, "grad_norm": 3.901002883911133, "learning_rate": 0.00011431869594820213, "loss": 1.7344, "step": 260 }, { "epoch": 0.008753018981083754, "grad_norm": 4.044349670410156, "learning_rate": 0.00010711209790870886, "loss": 1.576, "step": 270 }, { "epoch": 0.00907720486927204, "grad_norm": 3.8595011234283447, "learning_rate": 9.988790209129117e-05, "loss": 1.802, "step": 280 }, { "epoch": 0.009401390757460328, "grad_norm": 3.7126569747924805, "learning_rate": 9.268130405179787e-05, "loss": 1.6513, "step": 290 }, { "epoch": 0.009725576645648614, "grad_norm": 4.118965148925781, "learning_rate": 8.55274136114727e-05, "loss": 1.5466, "step": 300 }, { "epoch": 0.009725576645648614, "eval_loss": 1.6769126653671265, "eval_runtime": 399.0787, "eval_samples_per_second": 32.545, "eval_steps_per_second": 8.136, "step": 300 }, { "epoch": 0.010049762533836902, "grad_norm": 3.541172981262207, "learning_rate": 7.84610838054344e-05, "loss": 1.8639, "step": 310 }, { "epoch": 0.01037394842202519, "grad_norm": 4.0853753089904785, "learning_rate": 7.151674108219295e-05, "loss": 1.6957, "step": 320 }, { "epoch": 0.010698134310213476, "grad_norm": 3.440962076187134, "learning_rate": 6.472821758145309e-05, "loss": 1.6691, "step": 330 }, { "epoch": 0.011022320198401764, "grad_norm": 3.6532485485076904, "learning_rate": 5.8128586307330475e-05, "loss": 1.5867, "step": 340 }, { "epoch": 0.01134650608659005, "grad_norm": 3.979214906692505, "learning_rate": 5.175000000000002e-05, "loss": 1.5499, "step": 350 }, { "epoch": 0.01134650608659005, "eval_loss": 1.6310955286026, "eval_runtime": 398.0556, "eval_samples_per_second": 32.629, "eval_steps_per_second": 8.157, "step": 350 }, { "epoch": 0.011670691974778338, "grad_norm": 3.600574254989624, "learning_rate": 4.5623534490777714e-05, "loss": 1.8409, "step": 360 }, { "epoch": 0.011994877862966625, "grad_norm": 3.5220589637756348, "learning_rate": 3.9779037303794365e-05, "loss": 1.5889, "step": 370 }, { "epoch": 0.012319063751154912, "grad_norm": 3.7355663776397705, "learning_rate": 3.42449822418582e-05, "loss": 1.5124, "step": 380 }, { "epoch": 0.012643249639343199, "grad_norm": 3.8457112312316895, "learning_rate": 2.9048330664949622e-05, "loss": 1.5491, "step": 390 }, { "epoch": 0.012967435527531487, "grad_norm": 3.4412333965301514, "learning_rate": 2.4214400137185785e-05, "loss": 1.549, "step": 400 }, { "epoch": 0.012967435527531487, "eval_loss": 1.606103539466858, "eval_runtime": 396.0436, "eval_samples_per_second": 32.794, "eval_steps_per_second": 8.199, "step": 400 }, { "epoch": 0.013291621415719775, "grad_norm": 3.6701438426971436, "learning_rate": 1.976674108219295e-05, "loss": 1.6744, "step": 410 }, { "epoch": 0.01361580730390806, "grad_norm": 4.545055389404297, "learning_rate": 1.572702204780991e-05, "loss": 1.6089, "step": 420 }, { "epoch": 0.013939993192096349, "grad_norm": 3.4341204166412354, "learning_rate": 1.2114924139101056e-05, "loss": 1.5792, "step": 430 }, { "epoch": 0.014264179080284635, "grad_norm": 3.332812547683716, "learning_rate": 8.948045133990798e-06, "loss": 1.5303, "step": 440 }, { "epoch": 0.014588364968472923, "grad_norm": 3.597540855407715, "learning_rate": 6.241813748658489e-06, "loss": 1.5332, "step": 450 }, { "epoch": 0.014588364968472923, "eval_loss": 1.5877035856246948, "eval_runtime": 397.7506, "eval_samples_per_second": 32.654, "eval_steps_per_second": 8.163, "step": 450 }, { "epoch": 0.014912550856661209, "grad_norm": 3.2884576320648193, "learning_rate": 4.009414470383994e-06, "loss": 1.8572, "step": 460 }, { "epoch": 0.015236736744849497, "grad_norm": 3.6394126415252686, "learning_rate": 2.261723324051111e-06, "loss": 1.7304, "step": 470 }, { "epoch": 0.015560922633037783, "grad_norm": 2.9843626022338867, "learning_rate": 1.0072548852474675e-06, "loss": 1.631, "step": 480 }, { "epoch": 0.01588510852122607, "grad_norm": 3.780142068862915, "learning_rate": 2.5212079810819554e-07, "loss": 1.2314, "step": 490 }, { "epoch": 0.01620929440941436, "grad_norm": 3.420269012451172, "learning_rate": 0.0, "loss": 1.4883, "step": 500 }, { "epoch": 0.01620929440941436, "eval_loss": 1.5858772993087769, "eval_runtime": 398.3709, "eval_samples_per_second": 32.603, "eval_steps_per_second": 8.151, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.789971686378701e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }