{ "best_metric": 11.879236221313477, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.3941663381947182, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007883326763894363, "eval_loss": 11.92764663696289, "eval_runtime": 7.1099, "eval_samples_per_second": 75.247, "eval_steps_per_second": 18.847, "step": 1 }, { "epoch": 0.007883326763894364, "grad_norm": 0.07368365675210953, "learning_rate": 4.02e-05, "loss": 11.9274, "step": 10 }, { "epoch": 0.015766653527788728, "grad_norm": 0.07584132999181747, "learning_rate": 8.04e-05, "loss": 11.9287, "step": 20 }, { "epoch": 0.023649980291683092, "grad_norm": 0.09191010147333145, "learning_rate": 0.0001206, "loss": 11.9281, "step": 30 }, { "epoch": 0.031533307055577456, "grad_norm": 0.16363732516765594, "learning_rate": 0.0001608, "loss": 11.93, "step": 40 }, { "epoch": 0.039416633819471816, "grad_norm": 0.2393222153186798, "learning_rate": 0.000201, "loss": 11.9072, "step": 50 }, { "epoch": 0.039416633819471816, "eval_loss": 11.909907341003418, "eval_runtime": 6.7429, "eval_samples_per_second": 79.343, "eval_steps_per_second": 19.873, "step": 50 }, { "epoch": 0.047299960583366184, "grad_norm": 0.14083291590213776, "learning_rate": 0.00020075518705111234, "loss": 11.9181, "step": 60 }, { "epoch": 0.055183287347260544, "grad_norm": 0.1786942332983017, "learning_rate": 0.00020002194090852784, "loss": 11.9084, "step": 70 }, { "epoch": 0.06306661411115491, "grad_norm": 0.3063865900039673, "learning_rate": 0.00019880383387374748, "loss": 11.8998, "step": 80 }, { "epoch": 0.07094994087504927, "grad_norm": 0.29278698563575745, "learning_rate": 0.00019710680044180106, "loss": 11.8871, "step": 90 }, { "epoch": 0.07883326763894363, "grad_norm": 0.24070513248443604, "learning_rate": 0.0001949391083889838, "loss": 11.8757, "step": 100 }, { "epoch": 0.07883326763894363, "eval_loss": 11.89173698425293, "eval_runtime": 6.9854, "eval_samples_per_second": 76.589, "eval_steps_per_second": 19.183, "step": 100 }, { "epoch": 0.086716594402838, "grad_norm": 0.11940794438123703, "learning_rate": 0.00019231131849308138, "loss": 11.9059, "step": 110 }, { "epoch": 0.09459992116673237, "grad_norm": 0.11173859983682632, "learning_rate": 0.00018923623308232218, "loss": 11.8996, "step": 120 }, { "epoch": 0.10248324793062673, "grad_norm": 0.18088971078395844, "learning_rate": 0.00018572883366372081, "loss": 11.8945, "step": 130 }, { "epoch": 0.11036657469452109, "grad_norm": 0.2551112771034241, "learning_rate": 0.00018180620793468224, "loss": 11.8876, "step": 140 }, { "epoch": 0.11824990145841545, "grad_norm": 0.24974049627780914, "learning_rate": 0.00017748746653345728, "loss": 11.8724, "step": 150 }, { "epoch": 0.11824990145841545, "eval_loss": 11.8869047164917, "eval_runtime": 7.2467, "eval_samples_per_second": 73.826, "eval_steps_per_second": 18.491, "step": 150 }, { "epoch": 0.12613322822230982, "grad_norm": 0.10702640563249588, "learning_rate": 0.00017279364993403443, "loss": 11.9001, "step": 160 }, { "epoch": 0.13401655498620418, "grad_norm": 0.16987697780132294, "learning_rate": 0.00016774762593906525, "loss": 11.8924, "step": 170 }, { "epoch": 0.14189988175009854, "grad_norm": 0.19377273321151733, "learning_rate": 0.00016237397827022866, "loss": 11.8923, "step": 180 }, { "epoch": 0.1497832085139929, "grad_norm": 0.20443686842918396, "learning_rate": 0.00015669888679881007, "loss": 11.8788, "step": 190 }, { "epoch": 0.15766653527788727, "grad_norm": 0.28372666239738464, "learning_rate": 0.00015075, "loss": 11.8638, "step": 200 }, { "epoch": 0.15766653527788727, "eval_loss": 11.884162902832031, "eval_runtime": 7.0156, "eval_samples_per_second": 76.258, "eval_steps_per_second": 19.1, "step": 200 }, { "epoch": 0.16554986204178163, "grad_norm": 0.08742058277130127, "learning_rate": 0.00014455630025230227, "loss": 11.8971, "step": 210 }, { "epoch": 0.173433188805676, "grad_norm": 0.17662465572357178, "learning_rate": 0.00013814796263829918, "loss": 11.8937, "step": 220 }, { "epoch": 0.18131651556957035, "grad_norm": 0.21377001702785492, "learning_rate": 0.00013155620793468223, "loss": 11.8888, "step": 230 }, { "epoch": 0.18919984233346474, "grad_norm": 0.2640247941017151, "learning_rate": 0.0001248131505077666, "loss": 11.8752, "step": 240 }, { "epoch": 0.1970831690973591, "grad_norm": 0.3643070161342621, "learning_rate": 0.00011795164185552652, "loss": 11.8626, "step": 250 }, { "epoch": 0.1970831690973591, "eval_loss": 11.881659507751465, "eval_runtime": 6.7952, "eval_samples_per_second": 78.732, "eval_steps_per_second": 19.72, "step": 250 }, { "epoch": 0.20496649586125346, "grad_norm": 0.08927304297685623, "learning_rate": 0.00011100511055839919, "loss": 11.8957, "step": 260 }, { "epoch": 0.21284982262514782, "grad_norm": 0.1372813731431961, "learning_rate": 0.00010400739941860137, "loss": 11.8902, "step": 270 }, { "epoch": 0.22073314938904218, "grad_norm": 0.22779017686843872, "learning_rate": 9.699260058139868e-05, "loss": 11.8869, "step": 280 }, { "epoch": 0.22861647615293654, "grad_norm": 0.24806085228919983, "learning_rate": 8.999488944160085e-05, "loss": 11.8799, "step": 290 }, { "epoch": 0.2364998029168309, "grad_norm": 0.2623824179172516, "learning_rate": 8.30483581444735e-05, "loss": 11.8597, "step": 300 }, { "epoch": 0.2364998029168309, "eval_loss": 11.880826950073242, "eval_runtime": 7.0977, "eval_samples_per_second": 75.377, "eval_steps_per_second": 18.879, "step": 300 }, { "epoch": 0.24438312968072526, "grad_norm": 0.10820131003856659, "learning_rate": 7.618684949223341e-05, "loss": 11.8955, "step": 310 }, { "epoch": 0.25226645644461965, "grad_norm": 0.16028447449207306, "learning_rate": 6.94437920653178e-05, "loss": 11.8891, "step": 320 }, { "epoch": 0.260149783208514, "grad_norm": 0.21786223351955414, "learning_rate": 6.285203736170084e-05, "loss": 11.8818, "step": 330 }, { "epoch": 0.26803310997240837, "grad_norm": 0.2079205960035324, "learning_rate": 5.6443699747697714e-05, "loss": 11.8724, "step": 340 }, { "epoch": 0.27591643673630273, "grad_norm": 0.3210531771183014, "learning_rate": 5.025000000000002e-05, "loss": 11.8598, "step": 350 }, { "epoch": 0.27591643673630273, "eval_loss": 11.880062103271484, "eval_runtime": 7.0148, "eval_samples_per_second": 76.267, "eval_steps_per_second": 19.102, "step": 350 }, { "epoch": 0.2837997635001971, "grad_norm": 0.0896448940038681, "learning_rate": 4.430111320118996e-05, "loss": 11.8917, "step": 360 }, { "epoch": 0.29168309026409145, "grad_norm": 0.1952148824930191, "learning_rate": 3.862602172977134e-05, "loss": 11.8869, "step": 370 }, { "epoch": 0.2995664170279858, "grad_norm": 0.20794124901294708, "learning_rate": 3.325237406093478e-05, "loss": 11.8858, "step": 380 }, { "epoch": 0.30744974379188017, "grad_norm": 0.34835031628608704, "learning_rate": 2.820635006596558e-05, "loss": 11.871, "step": 390 }, { "epoch": 0.31533307055577453, "grad_norm": 0.43546953797340393, "learning_rate": 2.351253346654272e-05, "loss": 11.8622, "step": 400 }, { "epoch": 0.31533307055577453, "eval_loss": 11.879236221313477, "eval_runtime": 6.9599, "eval_samples_per_second": 76.868, "eval_steps_per_second": 19.253, "step": 400 }, { "epoch": 0.3232163973196689, "grad_norm": 0.10507825762033463, "learning_rate": 1.9193792065317794e-05, "loss": 11.8928, "step": 410 }, { "epoch": 0.33109972408356325, "grad_norm": 0.19336579740047455, "learning_rate": 1.5271166336279193e-05, "loss": 11.8913, "step": 420 }, { "epoch": 0.3389830508474576, "grad_norm": 0.21657152473926544, "learning_rate": 1.1763766917677837e-05, "loss": 11.8834, "step": 430 }, { "epoch": 0.346866377611352, "grad_norm": 0.3501337766647339, "learning_rate": 8.688681506918602e-06, "loss": 11.8705, "step": 440 }, { "epoch": 0.35474970437524633, "grad_norm": 0.3057042360305786, "learning_rate": 6.060891611016215e-06, "loss": 11.8628, "step": 450 }, { "epoch": 0.35474970437524633, "eval_loss": 11.879561424255371, "eval_runtime": 7.0964, "eval_samples_per_second": 75.39, "eval_steps_per_second": 18.883, "step": 450 }, { "epoch": 0.3626330311391407, "grad_norm": 0.10696567595005035, "learning_rate": 3.893199558198952e-06, "loss": 11.8918, "step": 460 }, { "epoch": 0.37051635790303505, "grad_norm": 0.20454297959804535, "learning_rate": 2.1961661262525285e-06, "loss": 11.8865, "step": 470 }, { "epoch": 0.37839968466692947, "grad_norm": 0.21538551151752472, "learning_rate": 9.780590914721787e-07, "loss": 11.8861, "step": 480 }, { "epoch": 0.38628301143082383, "grad_norm": 0.281142920255661, "learning_rate": 2.4481294888766817e-07, "loss": 11.8759, "step": 490 }, { "epoch": 0.3941663381947182, "grad_norm": 0.4691304564476013, "learning_rate": 0.0, "loss": 11.8598, "step": 500 }, { "epoch": 0.3941663381947182, "eval_loss": 11.87945556640625, "eval_runtime": 7.1342, "eval_samples_per_second": 74.991, "eval_steps_per_second": 18.783, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 342278799360.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }