|
{ |
|
"best_metric": 9.23099136352539, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-600", |
|
"epoch": 0.706090026478376, |
|
"eval_steps": 50, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0011768167107972932, |
|
"eval_loss": 10.379207611083984, |
|
"eval_runtime": 3.1746, |
|
"eval_samples_per_second": 451.087, |
|
"eval_steps_per_second": 112.772, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011768167107972934, |
|
"grad_norm": 0.5737844705581665, |
|
"learning_rate": 0.0002, |
|
"loss": 10.3536, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.023536334215945868, |
|
"grad_norm": 0.5810375809669495, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 10.3108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0353045013239188, |
|
"grad_norm": 0.5553200840950012, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 10.2622, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.047072668431891736, |
|
"grad_norm": 0.559855043888092, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 10.2096, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05884083553986467, |
|
"grad_norm": 0.5475898385047913, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 10.1577, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05884083553986467, |
|
"eval_loss": 10.128434181213379, |
|
"eval_runtime": 3.1442, |
|
"eval_samples_per_second": 455.443, |
|
"eval_steps_per_second": 113.861, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0706090026478376, |
|
"grad_norm": 3.1386730670928955, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 10.1068, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08237716975581054, |
|
"grad_norm": 0.5707929134368896, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 10.0549, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09414533686378347, |
|
"grad_norm": 0.5704991221427917, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 10.006, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1059135039717564, |
|
"grad_norm": 0.5538395643234253, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 9.9593, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11768167107972934, |
|
"grad_norm": 0.6376510858535767, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 9.912, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11768167107972934, |
|
"eval_loss": 9.88634204864502, |
|
"eval_runtime": 3.1338, |
|
"eval_samples_per_second": 456.959, |
|
"eval_steps_per_second": 114.24, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"grad_norm": 1.2404887676239014, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 9.8677, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1412180052956752, |
|
"grad_norm": 0.5645771622657776, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 9.8202, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15298617240364812, |
|
"grad_norm": 0.6336705088615417, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 9.7735, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16475433951162108, |
|
"grad_norm": 22.544200897216797, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 9.7257, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.176522506619594, |
|
"grad_norm": 1.2222896814346313, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 9.6954, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.176522506619594, |
|
"eval_loss": 9.663827896118164, |
|
"eval_runtime": 3.1098, |
|
"eval_samples_per_second": 460.483, |
|
"eval_steps_per_second": 115.121, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18829067372756694, |
|
"grad_norm": 0.5674024224281311, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 9.6526, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20005884083553988, |
|
"grad_norm": 0.5667808651924133, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 9.613, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2118270079435128, |
|
"grad_norm": 0.5394534468650818, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 9.5741, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.22359517505148574, |
|
"grad_norm": 0.5556952357292175, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 9.5348, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23536334215945867, |
|
"grad_norm": 0.5671817064285278, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 9.4987, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23536334215945867, |
|
"eval_loss": 9.482871055603027, |
|
"eval_runtime": 3.1933, |
|
"eval_samples_per_second": 448.444, |
|
"eval_steps_per_second": 112.111, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2471315092674316, |
|
"grad_norm": 0.7693024277687073, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 9.4781, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"grad_norm": 3.6308693885803223, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 9.4655, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.27066784348337747, |
|
"grad_norm": 0.5690402388572693, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 9.4397, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2824360105913504, |
|
"grad_norm": 0.5628238916397095, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 9.4196, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29420417769932333, |
|
"grad_norm": 0.5627617835998535, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 9.3916, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.29420417769932333, |
|
"eval_loss": 9.378963470458984, |
|
"eval_runtime": 3.1537, |
|
"eval_samples_per_second": 454.067, |
|
"eval_steps_per_second": 113.517, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30597234480729624, |
|
"grad_norm": 0.5669119954109192, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 9.3743, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3177405119152692, |
|
"grad_norm": 0.5632140040397644, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 9.3444, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.32950867902324216, |
|
"grad_norm": 0.5589627623558044, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 9.3176, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34127684613121506, |
|
"grad_norm": 0.5277190208435059, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 9.2899, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.353045013239188, |
|
"grad_norm": 0.5792800784111023, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 9.2735, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.353045013239188, |
|
"eval_loss": 9.273290634155273, |
|
"eval_runtime": 3.1419, |
|
"eval_samples_per_second": 455.778, |
|
"eval_steps_per_second": 113.945, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3648131803471609, |
|
"grad_norm": 0.5586740374565125, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 9.2749, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3765813474551339, |
|
"grad_norm": 0.5625619292259216, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 9.2613, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"grad_norm": 0.5586420893669128, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 9.2536, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40011768167107975, |
|
"grad_norm": 0.5391138195991516, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 9.2467, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41188584877905265, |
|
"grad_norm": 0.5625888705253601, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 9.2363, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.41188584877905265, |
|
"eval_loss": 9.241024017333984, |
|
"eval_runtime": 3.1378, |
|
"eval_samples_per_second": 456.375, |
|
"eval_steps_per_second": 114.094, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4236540158870256, |
|
"grad_norm": 0.5664399862289429, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 9.2488, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4354221829949985, |
|
"grad_norm": 0.5664339661598206, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 9.2408, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4471903501029715, |
|
"grad_norm": 42.019798278808594, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 9.2368, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4589585172109444, |
|
"grad_norm": 0.5508390069007874, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 9.2308, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47072668431891734, |
|
"grad_norm": 0.5313293933868408, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 9.2265, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.47072668431891734, |
|
"eval_loss": 9.23282241821289, |
|
"eval_runtime": 3.1777, |
|
"eval_samples_per_second": 450.641, |
|
"eval_steps_per_second": 112.66, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.48249485142689025, |
|
"grad_norm": 0.5782054662704468, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 9.2417, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4942630185348632, |
|
"grad_norm": 0.5586299896240234, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 9.2365, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5060311856428361, |
|
"grad_norm": 2.5999763011932373, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 9.2348, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"grad_norm": 0.5547754764556885, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 9.2269, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.529567519858782, |
|
"grad_norm": 0.5392950773239136, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 9.2234, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.529567519858782, |
|
"eval_loss": 9.231368064880371, |
|
"eval_runtime": 3.1373, |
|
"eval_samples_per_second": 456.443, |
|
"eval_steps_per_second": 114.111, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5413356869667549, |
|
"grad_norm": 0.5859640836715698, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 9.2368, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5531038540747278, |
|
"grad_norm": 0.5664204955101013, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 9.2346, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5648720211827007, |
|
"grad_norm": 0.5508027672767639, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 9.2328, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5766401882906738, |
|
"grad_norm": 0.5547236204147339, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 9.2273, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5884083553986467, |
|
"grad_norm": 0.5351949334144592, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 9.2272, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5884083553986467, |
|
"eval_loss": 9.230931282043457, |
|
"eval_runtime": 3.1481, |
|
"eval_samples_per_second": 454.871, |
|
"eval_steps_per_second": 113.718, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6001765225066196, |
|
"grad_norm": 0.5742395520210266, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 9.2388, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6119446896145925, |
|
"grad_norm": 0.5664268732070923, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 9.234, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6237128567225655, |
|
"grad_norm": 0.5586317777633667, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 9.231, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6354810238305384, |
|
"grad_norm": 0.5682714581489563, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 9.227, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"grad_norm": 0.5156871676445007, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 9.2246, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"eval_loss": 9.23094654083252, |
|
"eval_runtime": 3.1842, |
|
"eval_samples_per_second": 449.718, |
|
"eval_steps_per_second": 112.429, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6590173580464843, |
|
"grad_norm": 0.5625162124633789, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 9.2384, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6707855251544572, |
|
"grad_norm": 0.5742562413215637, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 9.2365, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6825536922624301, |
|
"grad_norm": 0.5625633001327515, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 9.2344, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.694321859370403, |
|
"grad_norm": 0.5586168766021729, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 9.2276, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.706090026478376, |
|
"grad_norm": 0.5523204207420349, |
|
"learning_rate": 0.0, |
|
"loss": 9.2266, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.706090026478376, |
|
"eval_loss": 9.23099136352539, |
|
"eval_runtime": 3.1361, |
|
"eval_samples_per_second": 456.613, |
|
"eval_steps_per_second": 114.153, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 128442497236992.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|