{ "best_metric": 1.1944663524627686, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.08488964346349745, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016977928692699492, "eval_loss": 1.4329637289047241, "eval_runtime": 234.9554, "eval_samples_per_second": 5.278, "eval_steps_per_second": 1.319, "step": 1 }, { "epoch": 0.001697792869269949, "grad_norm": 1.341242790222168, "learning_rate": 4.12e-05, "loss": 1.4209, "step": 10 }, { "epoch": 0.003395585738539898, "grad_norm": 0.7306675910949707, "learning_rate": 8.24e-05, "loss": 1.338, "step": 20 }, { "epoch": 0.0050933786078098476, "grad_norm": 0.7428624629974365, "learning_rate": 0.0001236, "loss": 1.2576, "step": 30 }, { "epoch": 0.006791171477079796, "grad_norm": 0.7021359205245972, "learning_rate": 0.0001648, "loss": 1.2015, "step": 40 }, { "epoch": 0.008488964346349746, "grad_norm": 0.8103872537612915, "learning_rate": 0.000206, "loss": 1.2281, "step": 50 }, { "epoch": 0.008488964346349746, "eval_loss": 1.2455830574035645, "eval_runtime": 237.2217, "eval_samples_per_second": 5.227, "eval_steps_per_second": 1.307, "step": 50 }, { "epoch": 0.010186757215619695, "grad_norm": 0.7842703461647034, "learning_rate": 0.0002057490971767619, "loss": 1.2212, "step": 60 }, { "epoch": 0.011884550084889643, "grad_norm": 0.8670001029968262, "learning_rate": 0.00020499761108038175, "loss": 1.2051, "step": 70 }, { "epoch": 0.013582342954159592, "grad_norm": 0.8176340460777283, "learning_rate": 0.00020374920287558198, "loss": 1.209, "step": 80 }, { "epoch": 0.015280135823429542, "grad_norm": 1.0117417573928833, "learning_rate": 0.00020200995468164684, "loss": 1.2546, "step": 90 }, { "epoch": 0.01697792869269949, "grad_norm": 1.4239269495010376, "learning_rate": 0.00019978833994094855, "loss": 1.3016, "step": 100 }, { "epoch": 0.01697792869269949, "eval_loss": 1.3394960165023804, "eval_runtime": 236.5162, "eval_samples_per_second": 5.243, "eval_steps_per_second": 1.311, "step": 100 }, { "epoch": 0.01867572156196944, "grad_norm": 0.6853418350219727, "learning_rate": 0.00019709518213718787, "loss": 1.3013, "step": 110 }, { "epoch": 0.02037351443123939, "grad_norm": 0.7583730816841125, "learning_rate": 0.00019394360206446948, "loss": 1.1855, "step": 120 }, { "epoch": 0.022071307300509338, "grad_norm": 0.8353321552276611, "learning_rate": 0.00019034895390411186, "loss": 1.2214, "step": 130 }, { "epoch": 0.023769100169779286, "grad_norm": 0.8519904613494873, "learning_rate": 0.0001863287504206196, "loss": 1.2907, "step": 140 }, { "epoch": 0.025466893039049237, "grad_norm": 0.7282483577728271, "learning_rate": 0.00018190257764125471, "loss": 1.1985, "step": 150 }, { "epoch": 0.025466893039049237, "eval_loss": 1.2455435991287231, "eval_runtime": 237.1965, "eval_samples_per_second": 5.228, "eval_steps_per_second": 1.307, "step": 150 }, { "epoch": 0.027164685908319185, "grad_norm": 0.8496482968330383, "learning_rate": 0.00017709199943488106, "loss": 1.1761, "step": 160 }, { "epoch": 0.028862478777589132, "grad_norm": 0.7892032861709595, "learning_rate": 0.00017192045245496238, "loss": 1.2281, "step": 170 }, { "epoch": 0.030560271646859084, "grad_norm": 0.8588133454322815, "learning_rate": 0.00016641313195854277, "loss": 1.205, "step": 180 }, { "epoch": 0.03225806451612903, "grad_norm": 0.9884034991264343, "learning_rate": 0.0001605968690574869, "loss": 1.2602, "step": 190 }, { "epoch": 0.03395585738539898, "grad_norm": 1.295212745666504, "learning_rate": 0.0001545, "loss": 1.2808, "step": 200 }, { "epoch": 0.03395585738539898, "eval_loss": 1.3785539865493774, "eval_runtime": 236.0337, "eval_samples_per_second": 5.253, "eval_steps_per_second": 1.313, "step": 200 }, { "epoch": 0.035653650254668934, "grad_norm": 0.7312771081924438, "learning_rate": 0.00014815222811927496, "loss": 1.249, "step": 210 }, { "epoch": 0.03735144312393888, "grad_norm": 0.748958170413971, "learning_rate": 0.00014158447912183896, "loss": 1.2374, "step": 220 }, { "epoch": 0.03904923599320883, "grad_norm": 0.7886843681335449, "learning_rate": 0.00013482875042061958, "loss": 1.2235, "step": 230 }, { "epoch": 0.04074702886247878, "grad_norm": 0.7367339134216309, "learning_rate": 0.00012791795524676576, "loss": 1.1971, "step": 240 }, { "epoch": 0.042444821731748725, "grad_norm": 0.8220826983451843, "learning_rate": 0.00012088576229969385, "loss": 1.2372, "step": 250 }, { "epoch": 0.042444821731748725, "eval_loss": 1.2279402017593384, "eval_runtime": 237.5615, "eval_samples_per_second": 5.22, "eval_steps_per_second": 1.305, "step": 250 }, { "epoch": 0.044142614601018676, "grad_norm": 0.7322437167167664, "learning_rate": 0.0001137664317165683, "loss": 1.2527, "step": 260 }, { "epoch": 0.04584040747028863, "grad_norm": 0.7987295985221863, "learning_rate": 0.00010659464816035761, "loss": 1.2151, "step": 270 }, { "epoch": 0.04753820033955857, "grad_norm": 0.8461410999298096, "learning_rate": 9.940535183964242e-05, "loss": 1.1581, "step": 280 }, { "epoch": 0.04923599320882852, "grad_norm": 0.893200159072876, "learning_rate": 9.22335682834317e-05, "loss": 1.1902, "step": 290 }, { "epoch": 0.050933786078098474, "grad_norm": 1.0908836126327515, "learning_rate": 8.511423770030617e-05, "loss": 1.285, "step": 300 }, { "epoch": 0.050933786078098474, "eval_loss": 1.2769778966903687, "eval_runtime": 235.1131, "eval_samples_per_second": 5.274, "eval_steps_per_second": 1.319, "step": 300 }, { "epoch": 0.05263157894736842, "grad_norm": 0.6881177425384521, "learning_rate": 7.808204475323423e-05, "loss": 1.2984, "step": 310 }, { "epoch": 0.05432937181663837, "grad_norm": 0.6913091540336609, "learning_rate": 7.117124957938042e-05, "loss": 1.1949, "step": 320 }, { "epoch": 0.05602716468590832, "grad_norm": 0.6359566450119019, "learning_rate": 6.441552087816105e-05, "loss": 1.2487, "step": 330 }, { "epoch": 0.057724957555178265, "grad_norm": 0.6509237289428711, "learning_rate": 5.784777188072502e-05, "loss": 1.1914, "step": 340 }, { "epoch": 0.059422750424448216, "grad_norm": 0.7074187994003296, "learning_rate": 5.150000000000002e-05, "loss": 1.1768, "step": 350 }, { "epoch": 0.059422750424448216, "eval_loss": 1.2067750692367554, "eval_runtime": 236.6964, "eval_samples_per_second": 5.239, "eval_steps_per_second": 1.31, "step": 350 }, { "epoch": 0.06112054329371817, "grad_norm": 0.7682424187660217, "learning_rate": 4.540313094251309e-05, "loss": 1.1792, "step": 360 }, { "epoch": 0.06281833616298811, "grad_norm": 0.7742196321487427, "learning_rate": 3.958686804145719e-05, "loss": 1.1434, "step": 370 }, { "epoch": 0.06451612903225806, "grad_norm": 0.8594280481338501, "learning_rate": 3.4079547545037634e-05, "loss": 1.1741, "step": 380 }, { "epoch": 0.06621392190152801, "grad_norm": 0.8601087331771851, "learning_rate": 2.8908000565118947e-05, "loss": 1.2038, "step": 390 }, { "epoch": 0.06791171477079797, "grad_norm": 1.167262077331543, "learning_rate": 2.4097422358745275e-05, "loss": 1.2147, "step": 400 }, { "epoch": 0.06791171477079797, "eval_loss": 1.2109880447387695, "eval_runtime": 235.5973, "eval_samples_per_second": 5.263, "eval_steps_per_second": 1.316, "step": 400 }, { "epoch": 0.06960950764006792, "grad_norm": 0.7046334743499756, "learning_rate": 1.9671249579380422e-05, "loss": 1.2294, "step": 410 }, { "epoch": 0.07130730050933787, "grad_norm": 0.6611829400062561, "learning_rate": 1.5651046095888127e-05, "loss": 1.1951, "step": 420 }, { "epoch": 0.0730050933786078, "grad_norm": 0.5956356525421143, "learning_rate": 1.205639793553052e-05, "loss": 1.1891, "step": 430 }, { "epoch": 0.07470288624787776, "grad_norm": 0.6220433115959167, "learning_rate": 8.904817862812098e-06, "loss": 1.1773, "step": 440 }, { "epoch": 0.07640067911714771, "grad_norm": 0.7181682586669922, "learning_rate": 6.211660059051443e-06, "loss": 1.2383, "step": 450 }, { "epoch": 0.07640067911714771, "eval_loss": 1.1962662935256958, "eval_runtime": 235.3359, "eval_samples_per_second": 5.269, "eval_steps_per_second": 1.317, "step": 450 }, { "epoch": 0.07809847198641766, "grad_norm": 0.7613552212715149, "learning_rate": 3.990045318353154e-06, "loss": 1.2263, "step": 460 }, { "epoch": 0.07979626485568761, "grad_norm": 0.7208029627799988, "learning_rate": 2.250797124418014e-06, "loss": 1.2039, "step": 470 }, { "epoch": 0.08149405772495756, "grad_norm": 0.8188151121139526, "learning_rate": 1.0023889196182526e-06, "loss": 1.2092, "step": 480 }, { "epoch": 0.0831918505942275, "grad_norm": 0.7839704751968384, "learning_rate": 2.5090282323810766e-07, "loss": 1.2678, "step": 490 }, { "epoch": 0.08488964346349745, "grad_norm": 1.197765588760376, "learning_rate": 0.0, "loss": 1.2776, "step": 500 }, { "epoch": 0.08488964346349745, "eval_loss": 1.1944663524627686, "eval_runtime": 235.1406, "eval_samples_per_second": 5.273, "eval_steps_per_second": 1.318, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.589424290959196e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }