{ "best_metric": 1.2670997381210327, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.7475083056478405, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016611295681063123, "eval_loss": 2.1715543270111084, "eval_runtime": 69.5427, "eval_samples_per_second": 14.581, "eval_steps_per_second": 3.652, "step": 1 }, { "epoch": 0.016611295681063124, "grad_norm": 0.28691908717155457, "learning_rate": 0.0002, "loss": 1.8097, "step": 10 }, { "epoch": 0.03322259136212625, "grad_norm": 0.39086708426475525, "learning_rate": 0.0001998582695676762, "loss": 1.4881, "step": 20 }, { "epoch": 0.04983388704318937, "grad_norm": 0.44144389033317566, "learning_rate": 0.00019943348002101371, "loss": 1.3426, "step": 30 }, { "epoch": 0.0664451827242525, "grad_norm": 0.45453155040740967, "learning_rate": 0.00019872683547213446, "loss": 1.3136, "step": 40 }, { "epoch": 0.08305647840531562, "grad_norm": 0.6533491015434265, "learning_rate": 0.00019774033898178667, "loss": 1.592, "step": 50 }, { "epoch": 0.08305647840531562, "eval_loss": 1.4280692338943481, "eval_runtime": 70.7159, "eval_samples_per_second": 14.339, "eval_steps_per_second": 3.592, "step": 50 }, { "epoch": 0.09966777408637874, "grad_norm": 0.2205391824245453, "learning_rate": 0.0001964767868814516, "loss": 1.4082, "step": 60 }, { "epoch": 0.11627906976744186, "grad_norm": 0.257103830575943, "learning_rate": 0.00019493976084683813, "loss": 1.2586, "step": 70 }, { "epoch": 0.132890365448505, "grad_norm": 0.2941131293773651, "learning_rate": 0.00019313361774523385, "loss": 1.2236, "step": 80 }, { "epoch": 0.14950166112956811, "grad_norm": 0.4034242331981659, "learning_rate": 0.00019106347728549135, "loss": 1.2459, "step": 90 }, { "epoch": 0.16611295681063123, "grad_norm": 0.6419020891189575, "learning_rate": 0.00018873520750565718, "loss": 1.499, "step": 100 }, { "epoch": 0.16611295681063123, "eval_loss": 1.4120386838912964, "eval_runtime": 70.5417, "eval_samples_per_second": 14.374, "eval_steps_per_second": 3.601, "step": 100 }, { "epoch": 0.18272425249169436, "grad_norm": 0.28808289766311646, "learning_rate": 0.0001861554081393806, "loss": 1.336, "step": 110 }, { "epoch": 0.19933554817275748, "grad_norm": 0.29374128580093384, "learning_rate": 0.0001833313919082515, "loss": 1.2921, "step": 120 }, { "epoch": 0.2159468438538206, "grad_norm": 0.3089810907840729, "learning_rate": 0.00018027116379309638, "loss": 1.2395, "step": 130 }, { "epoch": 0.23255813953488372, "grad_norm": 0.43953055143356323, "learning_rate": 0.00017698339834299061, "loss": 1.1861, "step": 140 }, { "epoch": 0.24916943521594684, "grad_norm": 0.6231277585029602, "learning_rate": 0.00017347741508630672, "loss": 1.3598, "step": 150 }, { "epoch": 0.24916943521594684, "eval_loss": 1.4176186323165894, "eval_runtime": 70.5732, "eval_samples_per_second": 14.368, "eval_steps_per_second": 3.599, "step": 150 }, { "epoch": 0.26578073089701, "grad_norm": 0.3004702031612396, "learning_rate": 0.0001697631521134985, "loss": 1.4307, "step": 160 }, { "epoch": 0.2823920265780731, "grad_norm": 0.2584735155105591, "learning_rate": 0.00016585113790650388, "loss": 1.2259, "step": 170 }, { "epoch": 0.29900332225913623, "grad_norm": 0.30790308117866516, "learning_rate": 0.0001617524614946192, "loss": 1.1969, "step": 180 }, { "epoch": 0.31561461794019935, "grad_norm": 0.45509302616119385, "learning_rate": 0.0001574787410214407, "loss": 1.2463, "step": 190 }, { "epoch": 0.33222591362126247, "grad_norm": 0.5611400604248047, "learning_rate": 0.00015304209081197425, "loss": 1.4207, "step": 200 }, { "epoch": 0.33222591362126247, "eval_loss": 1.404697299003601, "eval_runtime": 70.4401, "eval_samples_per_second": 14.395, "eval_steps_per_second": 3.606, "step": 200 }, { "epoch": 0.3488372093023256, "grad_norm": 0.27873846888542175, "learning_rate": 0.00014845508703326504, "loss": 1.3757, "step": 210 }, { "epoch": 0.3654485049833887, "grad_norm": 0.2696116268634796, "learning_rate": 0.00014373073204588556, "loss": 1.2607, "step": 220 }, { "epoch": 0.38205980066445183, "grad_norm": 0.31358882784843445, "learning_rate": 0.00013888241754733208, "loss": 1.2161, "step": 230 }, { "epoch": 0.39867109634551495, "grad_norm": 0.4030625820159912, "learning_rate": 0.00013392388661180303, "loss": 1.2135, "step": 240 }, { "epoch": 0.4152823920265781, "grad_norm": 0.4767776131629944, "learning_rate": 0.0001288691947339621, "loss": 1.3493, "step": 250 }, { "epoch": 0.4152823920265781, "eval_loss": 1.3333581686019897, "eval_runtime": 70.4196, "eval_samples_per_second": 14.399, "eval_steps_per_second": 3.607, "step": 250 }, { "epoch": 0.4318936877076412, "grad_norm": 0.2525186538696289, "learning_rate": 0.0001237326699871115, "loss": 1.312, "step": 260 }, { "epoch": 0.4485049833887043, "grad_norm": 0.2510424554347992, "learning_rate": 0.00011852887240871145, "loss": 1.2663, "step": 270 }, { "epoch": 0.46511627906976744, "grad_norm": 0.27276328206062317, "learning_rate": 0.00011327255272837221, "loss": 1.149, "step": 280 }, { "epoch": 0.48172757475083056, "grad_norm": 0.43301859498023987, "learning_rate": 0.00010797861055530831, "loss": 1.1877, "step": 290 }, { "epoch": 0.4983388704318937, "grad_norm": 0.5607919096946716, "learning_rate": 0.00010266205214377748, "loss": 1.3628, "step": 300 }, { "epoch": 0.4983388704318937, "eval_loss": 1.299776554107666, "eval_runtime": 70.6911, "eval_samples_per_second": 14.344, "eval_steps_per_second": 3.593, "step": 300 }, { "epoch": 0.5149501661129569, "grad_norm": 0.2536980211734772, "learning_rate": 9.733794785622253e-05, "loss": 1.365, "step": 310 }, { "epoch": 0.53156146179402, "grad_norm": 0.28358352184295654, "learning_rate": 9.202138944469168e-05, "loss": 1.2521, "step": 320 }, { "epoch": 0.5481727574750831, "grad_norm": 0.2929559350013733, "learning_rate": 8.672744727162781e-05, "loss": 1.2241, "step": 330 }, { "epoch": 0.5647840531561462, "grad_norm": 0.4324483573436737, "learning_rate": 8.147112759128859e-05, "loss": 1.2225, "step": 340 }, { "epoch": 0.5813953488372093, "grad_norm": 0.47393572330474854, "learning_rate": 7.626733001288851e-05, "loss": 1.3531, "step": 350 }, { "epoch": 0.5813953488372093, "eval_loss": 1.2868059873580933, "eval_runtime": 70.612, "eval_samples_per_second": 14.36, "eval_steps_per_second": 3.597, "step": 350 }, { "epoch": 0.5980066445182725, "grad_norm": 0.26292383670806885, "learning_rate": 7.113080526603792e-05, "loss": 1.3364, "step": 360 }, { "epoch": 0.6146179401993356, "grad_norm": 0.28483420610427856, "learning_rate": 6.607611338819697e-05, "loss": 1.2009, "step": 370 }, { "epoch": 0.6312292358803987, "grad_norm": 0.3586254119873047, "learning_rate": 6.111758245266794e-05, "loss": 1.2086, "step": 380 }, { "epoch": 0.6478405315614618, "grad_norm": 0.374376505613327, "learning_rate": 5.626926795411447e-05, "loss": 1.2017, "step": 390 }, { "epoch": 0.6644518272425249, "grad_norm": 0.546507716178894, "learning_rate": 5.1544912966734994e-05, "loss": 1.2866, "step": 400 }, { "epoch": 0.6644518272425249, "eval_loss": 1.272492527961731, "eval_runtime": 70.4532, "eval_samples_per_second": 14.393, "eval_steps_per_second": 3.605, "step": 400 }, { "epoch": 0.6810631229235881, "grad_norm": 0.2525825500488281, "learning_rate": 4.695790918802576e-05, "loss": 1.3349, "step": 410 }, { "epoch": 0.6976744186046512, "grad_norm": 0.2703811824321747, "learning_rate": 4.252125897855932e-05, "loss": 1.2055, "step": 420 }, { "epoch": 0.7142857142857143, "grad_norm": 0.2996411919593811, "learning_rate": 3.824753850538082e-05, "loss": 1.2155, "step": 430 }, { "epoch": 0.7308970099667774, "grad_norm": 0.3596253991127014, "learning_rate": 3.414886209349615e-05, "loss": 1.1916, "step": 440 }, { "epoch": 0.7475083056478405, "grad_norm": 0.48794233798980713, "learning_rate": 3.0236847886501542e-05, "loss": 1.3421, "step": 450 }, { "epoch": 0.7475083056478405, "eval_loss": 1.2670997381210327, "eval_runtime": 70.6313, "eval_samples_per_second": 14.356, "eval_steps_per_second": 3.596, "step": 450 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.7504888703156224e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }