{ "best_metric": 1.0734155178070068, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.3838771593090211, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007677543186180423, "eval_loss": 1.699501633644104, "eval_runtime": 34.5919, "eval_samples_per_second": 15.871, "eval_steps_per_second": 3.989, "step": 1 }, { "epoch": 0.007677543186180422, "grad_norm": 1.66623854637146, "learning_rate": 4.1400000000000003e-05, "loss": 1.4758, "step": 10 }, { "epoch": 0.015355086372360844, "grad_norm": 1.6191164255142212, "learning_rate": 8.280000000000001e-05, "loss": 1.1966, "step": 20 }, { "epoch": 0.023032629558541268, "grad_norm": 1.8581939935684204, "learning_rate": 0.00012419999999999998, "loss": 1.1972, "step": 30 }, { "epoch": 0.030710172744721688, "grad_norm": 2.0947399139404297, "learning_rate": 0.00016560000000000001, "loss": 1.1415, "step": 40 }, { "epoch": 0.03838771593090211, "grad_norm": 3.375594139099121, "learning_rate": 0.000207, "loss": 1.1509, "step": 50 }, { "epoch": 0.03838771593090211, "eval_loss": 1.1970173120498657, "eval_runtime": 34.6084, "eval_samples_per_second": 15.863, "eval_steps_per_second": 3.987, "step": 50 }, { "epoch": 0.046065259117082535, "grad_norm": 1.4098953008651733, "learning_rate": 0.00020674787920189178, "loss": 1.0217, "step": 60 }, { "epoch": 0.053742802303262956, "grad_norm": 1.2954459190368652, "learning_rate": 0.00020599274511475253, "loss": 1.196, "step": 70 }, { "epoch": 0.061420345489443376, "grad_norm": 1.554813265800476, "learning_rate": 0.00020473827667594888, "loss": 1.1842, "step": 80 }, { "epoch": 0.0690978886756238, "grad_norm": 1.6619945764541626, "learning_rate": 0.00020299058552961598, "loss": 1.1747, "step": 90 }, { "epoch": 0.07677543186180422, "grad_norm": 5.735137462615967, "learning_rate": 0.00020075818625134152, "loss": 1.1506, "step": 100 }, { "epoch": 0.07677543186180422, "eval_loss": 1.270753026008606, "eval_runtime": 34.5609, "eval_samples_per_second": 15.885, "eval_steps_per_second": 3.993, "step": 100 }, { "epoch": 0.08445297504798464, "grad_norm": 1.7361294031143188, "learning_rate": 0.00019805195486600916, "loss": 1.0687, "step": 110 }, { "epoch": 0.09213051823416507, "grad_norm": 1.5904954671859741, "learning_rate": 0.00019488507586089894, "loss": 1.139, "step": 120 }, { "epoch": 0.09980806142034548, "grad_norm": 1.775456428527832, "learning_rate": 0.00019127297795219008, "loss": 1.1186, "step": 130 }, { "epoch": 0.10748560460652591, "grad_norm": 2.0062367916107178, "learning_rate": 0.00018723325891780706, "loss": 1.2057, "step": 140 }, { "epoch": 0.11516314779270634, "grad_norm": 3.6948869228363037, "learning_rate": 0.0001827855998628142, "loss": 1.0385, "step": 150 }, { "epoch": 0.11516314779270634, "eval_loss": 1.1850098371505737, "eval_runtime": 34.5429, "eval_samples_per_second": 15.893, "eval_steps_per_second": 3.995, "step": 150 }, { "epoch": 0.12284069097888675, "grad_norm": 1.3466116189956665, "learning_rate": 0.0001779516693350504, "loss": 1.0089, "step": 160 }, { "epoch": 0.13051823416506717, "grad_norm": 1.5833830833435059, "learning_rate": 0.00017275501775814182, "loss": 1.183, "step": 170 }, { "epoch": 0.1381957773512476, "grad_norm": 1.7136775255203247, "learning_rate": 0.00016722096269620562, "loss": 1.1192, "step": 180 }, { "epoch": 0.14587332053742802, "grad_norm": 1.7458416223526, "learning_rate": 0.00016137646550922228, "loss": 1.1443, "step": 190 }, { "epoch": 0.15355086372360843, "grad_norm": 4.119271755218506, "learning_rate": 0.00015525, "loss": 1.1007, "step": 200 }, { "epoch": 0.15355086372360843, "eval_loss": 1.1791120767593384, "eval_runtime": 34.4288, "eval_samples_per_second": 15.946, "eval_steps_per_second": 4.008, "step": 200 }, { "epoch": 0.16122840690978887, "grad_norm": 1.5441445112228394, "learning_rate": 0.0001488714136926695, "loss": 0.987, "step": 210 }, { "epoch": 0.1689059500959693, "grad_norm": 1.489668846130371, "learning_rate": 0.0001422717824185469, "loss": 1.1229, "step": 220 }, { "epoch": 0.1765834932821497, "grad_norm": 1.63771653175354, "learning_rate": 0.00013548325891780705, "loss": 1.0888, "step": 230 }, { "epoch": 0.18426103646833014, "grad_norm": 1.9554612636566162, "learning_rate": 0.0001285389161945656, "loss": 1.1235, "step": 240 }, { "epoch": 0.19193857965451055, "grad_norm": 2.936532735824585, "learning_rate": 0.0001214725863885273, "loss": 1.1173, "step": 250 }, { "epoch": 0.19193857965451055, "eval_loss": 1.164772629737854, "eval_runtime": 34.6581, "eval_samples_per_second": 15.84, "eval_steps_per_second": 3.982, "step": 250 }, { "epoch": 0.19961612284069097, "grad_norm": 1.3022620677947998, "learning_rate": 0.00011431869594820213, "loss": 1.0618, "step": 260 }, { "epoch": 0.2072936660268714, "grad_norm": 1.5140739679336548, "learning_rate": 0.00010711209790870886, "loss": 1.0069, "step": 270 }, { "epoch": 0.21497120921305182, "grad_norm": 2.146951675415039, "learning_rate": 9.988790209129117e-05, "loss": 1.1379, "step": 280 }, { "epoch": 0.22264875239923224, "grad_norm": 1.9529863595962524, "learning_rate": 9.268130405179787e-05, "loss": 1.143, "step": 290 }, { "epoch": 0.23032629558541268, "grad_norm": 4.628264904022217, "learning_rate": 8.55274136114727e-05, "loss": 1.1066, "step": 300 }, { "epoch": 0.23032629558541268, "eval_loss": 1.1256804466247559, "eval_runtime": 34.7811, "eval_samples_per_second": 15.784, "eval_steps_per_second": 3.968, "step": 300 }, { "epoch": 0.2380038387715931, "grad_norm": 1.5281367301940918, "learning_rate": 7.84610838054344e-05, "loss": 1.0167, "step": 310 }, { "epoch": 0.2456813819577735, "grad_norm": 1.4944322109222412, "learning_rate": 7.151674108219295e-05, "loss": 1.0431, "step": 320 }, { "epoch": 0.2533589251439539, "grad_norm": 1.5290899276733398, "learning_rate": 6.472821758145309e-05, "loss": 1.0743, "step": 330 }, { "epoch": 0.26103646833013433, "grad_norm": 1.7399842739105225, "learning_rate": 5.8128586307330475e-05, "loss": 1.1366, "step": 340 }, { "epoch": 0.2687140115163148, "grad_norm": 3.488797903060913, "learning_rate": 5.175000000000002e-05, "loss": 1.106, "step": 350 }, { "epoch": 0.2687140115163148, "eval_loss": 1.0944420099258423, "eval_runtime": 34.5813, "eval_samples_per_second": 15.876, "eval_steps_per_second": 3.991, "step": 350 }, { "epoch": 0.2763915547024952, "grad_norm": 1.4888174533843994, "learning_rate": 4.5623534490777714e-05, "loss": 0.9956, "step": 360 }, { "epoch": 0.2840690978886756, "grad_norm": 1.6224281787872314, "learning_rate": 3.9779037303794365e-05, "loss": 1.1409, "step": 370 }, { "epoch": 0.29174664107485604, "grad_norm": 1.536541223526001, "learning_rate": 3.42449822418582e-05, "loss": 1.1059, "step": 380 }, { "epoch": 0.29942418426103645, "grad_norm": 1.5598728656768799, "learning_rate": 2.9048330664949622e-05, "loss": 1.0613, "step": 390 }, { "epoch": 0.30710172744721687, "grad_norm": 3.250027894973755, "learning_rate": 2.4214400137185785e-05, "loss": 1.0349, "step": 400 }, { "epoch": 0.30710172744721687, "eval_loss": 1.0797396898269653, "eval_runtime": 34.5561, "eval_samples_per_second": 15.887, "eval_steps_per_second": 3.994, "step": 400 }, { "epoch": 0.31477927063339733, "grad_norm": 1.4682611227035522, "learning_rate": 1.976674108219295e-05, "loss": 1.0682, "step": 410 }, { "epoch": 0.32245681381957775, "grad_norm": 1.593374490737915, "learning_rate": 1.572702204780991e-05, "loss": 0.9887, "step": 420 }, { "epoch": 0.33013435700575816, "grad_norm": 1.7837949991226196, "learning_rate": 1.2114924139101056e-05, "loss": 1.0989, "step": 430 }, { "epoch": 0.3378119001919386, "grad_norm": 1.7039244174957275, "learning_rate": 8.948045133990798e-06, "loss": 1.0906, "step": 440 }, { "epoch": 0.345489443378119, "grad_norm": 4.831996440887451, "learning_rate": 6.241813748658489e-06, "loss": 1.0559, "step": 450 }, { "epoch": 0.345489443378119, "eval_loss": 1.0759239196777344, "eval_runtime": 34.4862, "eval_samples_per_second": 15.919, "eval_steps_per_second": 4.002, "step": 450 }, { "epoch": 0.3531669865642994, "grad_norm": 1.207382082939148, "learning_rate": 4.009414470383994e-06, "loss": 0.9927, "step": 460 }, { "epoch": 0.36084452975047987, "grad_norm": 1.4170714616775513, "learning_rate": 2.261723324051111e-06, "loss": 1.037, "step": 470 }, { "epoch": 0.3685220729366603, "grad_norm": 1.47847580909729, "learning_rate": 1.0072548852474675e-06, "loss": 1.0271, "step": 480 }, { "epoch": 0.3761996161228407, "grad_norm": 1.718285083770752, "learning_rate": 2.5212079810819554e-07, "loss": 1.0511, "step": 490 }, { "epoch": 0.3838771593090211, "grad_norm": 3.448530435562134, "learning_rate": 0.0, "loss": 1.0046, "step": 500 }, { "epoch": 0.3838771593090211, "eval_loss": 1.0734155178070068, "eval_runtime": 34.6444, "eval_samples_per_second": 15.847, "eval_steps_per_second": 3.983, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.729912582910771e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }