|
{ |
|
"best_metric": 1.2338447570800781, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.6901311249137336, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013802622498274672, |
|
"eval_loss": 3.10593581199646, |
|
"eval_runtime": 18.6566, |
|
"eval_samples_per_second": 16.348, |
|
"eval_steps_per_second": 4.127, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013802622498274672, |
|
"grad_norm": 24.552059173583984, |
|
"learning_rate": 4.1400000000000003e-05, |
|
"loss": 5.1127, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.027605244996549344, |
|
"grad_norm": 15.504034042358398, |
|
"learning_rate": 8.280000000000001e-05, |
|
"loss": 3.841, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.041407867494824016, |
|
"grad_norm": 14.46058177947998, |
|
"learning_rate": 0.00012419999999999998, |
|
"loss": 3.534, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05521048999309869, |
|
"grad_norm": 51.250701904296875, |
|
"learning_rate": 0.00016560000000000001, |
|
"loss": 3.9728, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06901311249137336, |
|
"grad_norm": 27.90164566040039, |
|
"learning_rate": 0.000207, |
|
"loss": 3.3681, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06901311249137336, |
|
"eval_loss": 2.3035168647766113, |
|
"eval_runtime": 18.6377, |
|
"eval_samples_per_second": 16.365, |
|
"eval_steps_per_second": 4.131, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08281573498964803, |
|
"grad_norm": 16.63316535949707, |
|
"learning_rate": 0.00020674787920189178, |
|
"loss": 4.045, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0966183574879227, |
|
"grad_norm": 14.355842590332031, |
|
"learning_rate": 0.00020599274511475253, |
|
"loss": 3.2498, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11042097998619738, |
|
"grad_norm": 14.458366394042969, |
|
"learning_rate": 0.00020473827667594888, |
|
"loss": 3.7108, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 16.983057022094727, |
|
"learning_rate": 0.00020299058552961598, |
|
"loss": 3.4946, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13802622498274672, |
|
"grad_norm": 30.41816520690918, |
|
"learning_rate": 0.00020075818625134152, |
|
"loss": 3.7154, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13802622498274672, |
|
"eval_loss": 2.316256523132324, |
|
"eval_runtime": 18.6116, |
|
"eval_samples_per_second": 16.388, |
|
"eval_steps_per_second": 4.137, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1518288474810214, |
|
"grad_norm": 69.45185852050781, |
|
"learning_rate": 0.00019805195486600916, |
|
"loss": 3.5852, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16563146997929606, |
|
"grad_norm": 14.340858459472656, |
|
"learning_rate": 0.00019488507586089894, |
|
"loss": 3.4744, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.17943409247757075, |
|
"grad_norm": 28.43404197692871, |
|
"learning_rate": 0.00019127297795219008, |
|
"loss": 3.1552, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1932367149758454, |
|
"grad_norm": 18.572847366333008, |
|
"learning_rate": 0.00018723325891780706, |
|
"loss": 3.3675, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2070393374741201, |
|
"grad_norm": 26.13022804260254, |
|
"learning_rate": 0.0001827855998628142, |
|
"loss": 3.4886, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2070393374741201, |
|
"eval_loss": 1.9903616905212402, |
|
"eval_runtime": 18.6032, |
|
"eval_samples_per_second": 16.395, |
|
"eval_steps_per_second": 4.139, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22084195997239475, |
|
"grad_norm": 17.420490264892578, |
|
"learning_rate": 0.0001779516693350504, |
|
"loss": 3.5297, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.23464458247066944, |
|
"grad_norm": 21.1903133392334, |
|
"learning_rate": 0.00017275501775814182, |
|
"loss": 3.1483, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2484472049689441, |
|
"grad_norm": 16.870092391967773, |
|
"learning_rate": 0.00016722096269620562, |
|
"loss": 2.9207, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.26224982746721875, |
|
"grad_norm": 19.400983810424805, |
|
"learning_rate": 0.00016137646550922228, |
|
"loss": 4.4511, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.27605244996549344, |
|
"grad_norm": 23.654308319091797, |
|
"learning_rate": 0.00015525, |
|
"loss": 2.9133, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27605244996549344, |
|
"eval_loss": 1.8559632301330566, |
|
"eval_runtime": 18.5949, |
|
"eval_samples_per_second": 16.402, |
|
"eval_steps_per_second": 4.141, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": 27.395263671875, |
|
"learning_rate": 0.0001488714136926695, |
|
"loss": 3.4052, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3036576949620428, |
|
"grad_norm": 15.65333080291748, |
|
"learning_rate": 0.0001422717824185469, |
|
"loss": 3.1855, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.31746031746031744, |
|
"grad_norm": 13.883142471313477, |
|
"learning_rate": 0.00013548325891780705, |
|
"loss": 3.1015, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.33126293995859213, |
|
"grad_norm": 20.504344940185547, |
|
"learning_rate": 0.0001285389161945656, |
|
"loss": 3.6529, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3450655624568668, |
|
"grad_norm": 37.432334899902344, |
|
"learning_rate": 0.0001214725863885273, |
|
"loss": 4.7279, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3450655624568668, |
|
"eval_loss": 1.8165959119796753, |
|
"eval_runtime": 18.6944, |
|
"eval_samples_per_second": 16.315, |
|
"eval_steps_per_second": 4.119, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3588681849551415, |
|
"grad_norm": 16.661788940429688, |
|
"learning_rate": 0.00011431869594820213, |
|
"loss": 3.6444, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.37267080745341613, |
|
"grad_norm": 21.61427116394043, |
|
"learning_rate": 0.00010711209790870886, |
|
"loss": 2.8124, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3864734299516908, |
|
"grad_norm": 14.587434768676758, |
|
"learning_rate": 9.988790209129117e-05, |
|
"loss": 2.6168, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4002760524499655, |
|
"grad_norm": 10.792880058288574, |
|
"learning_rate": 9.268130405179787e-05, |
|
"loss": 2.8998, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4140786749482402, |
|
"grad_norm": 19.90591049194336, |
|
"learning_rate": 8.55274136114727e-05, |
|
"loss": 2.9482, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4140786749482402, |
|
"eval_loss": 1.5225303173065186, |
|
"eval_runtime": 18.6815, |
|
"eval_samples_per_second": 16.326, |
|
"eval_steps_per_second": 4.122, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4278812974465148, |
|
"grad_norm": 12.331941604614258, |
|
"learning_rate": 7.84610838054344e-05, |
|
"loss": 2.8716, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4416839199447895, |
|
"grad_norm": 12.893694877624512, |
|
"learning_rate": 7.151674108219295e-05, |
|
"loss": 2.9602, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4554865424430642, |
|
"grad_norm": 10.044930458068848, |
|
"learning_rate": 6.472821758145309e-05, |
|
"loss": 2.7228, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4692891649413389, |
|
"grad_norm": 10.728809356689453, |
|
"learning_rate": 5.8128586307330475e-05, |
|
"loss": 2.9921, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4830917874396135, |
|
"grad_norm": 20.39414405822754, |
|
"learning_rate": 5.175000000000002e-05, |
|
"loss": 2.815, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4830917874396135, |
|
"eval_loss": 1.4471914768218994, |
|
"eval_runtime": 18.6607, |
|
"eval_samples_per_second": 16.344, |
|
"eval_steps_per_second": 4.126, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4968944099378882, |
|
"grad_norm": 11.737767219543457, |
|
"learning_rate": 4.5623534490777714e-05, |
|
"loss": 2.9584, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5106970324361628, |
|
"grad_norm": 16.33255958557129, |
|
"learning_rate": 3.9779037303794365e-05, |
|
"loss": 2.6765, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5244996549344375, |
|
"grad_norm": 10.07257080078125, |
|
"learning_rate": 3.42449822418582e-05, |
|
"loss": 2.715, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5383022774327122, |
|
"grad_norm": 20.379079818725586, |
|
"learning_rate": 2.9048330664949622e-05, |
|
"loss": 2.5815, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5521048999309869, |
|
"grad_norm": 24.735910415649414, |
|
"learning_rate": 2.4214400137185785e-05, |
|
"loss": 2.5287, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5521048999309869, |
|
"eval_loss": 1.2898170948028564, |
|
"eval_runtime": 18.6447, |
|
"eval_samples_per_second": 16.359, |
|
"eval_steps_per_second": 4.13, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5659075224292616, |
|
"grad_norm": 10.461315155029297, |
|
"learning_rate": 1.976674108219295e-05, |
|
"loss": 2.38, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5797101449275363, |
|
"grad_norm": 12.781827926635742, |
|
"learning_rate": 1.572702204780991e-05, |
|
"loss": 2.5618, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5935127674258109, |
|
"grad_norm": 12.61215877532959, |
|
"learning_rate": 1.2114924139101056e-05, |
|
"loss": 2.1771, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6073153899240856, |
|
"grad_norm": 12.710418701171875, |
|
"learning_rate": 8.948045133990798e-06, |
|
"loss": 2.2393, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6211180124223602, |
|
"grad_norm": 20.297300338745117, |
|
"learning_rate": 6.241813748658489e-06, |
|
"loss": 2.4048, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6211180124223602, |
|
"eval_loss": 1.241526484489441, |
|
"eval_runtime": 18.7361, |
|
"eval_samples_per_second": 16.279, |
|
"eval_steps_per_second": 4.11, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 13.001998901367188, |
|
"learning_rate": 4.009414470383994e-06, |
|
"loss": 2.4387, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6487232574189096, |
|
"grad_norm": 10.605116844177246, |
|
"learning_rate": 2.261723324051111e-06, |
|
"loss": 2.2201, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6625258799171843, |
|
"grad_norm": 8.887131690979004, |
|
"learning_rate": 1.0072548852474675e-06, |
|
"loss": 1.831, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6763285024154589, |
|
"grad_norm": 15.026272773742676, |
|
"learning_rate": 2.5212079810819554e-07, |
|
"loss": 2.6445, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6901311249137336, |
|
"grad_norm": 23.53835678100586, |
|
"learning_rate": 0.0, |
|
"loss": 2.4292, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6901311249137336, |
|
"eval_loss": 1.2338447570800781, |
|
"eval_runtime": 18.701, |
|
"eval_samples_per_second": 16.309, |
|
"eval_steps_per_second": 4.117, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.9437425893376e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|