|
{ |
|
"best_metric": 2.1577975749969482, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-600", |
|
"epoch": 0.061870021396715734, |
|
"eval_steps": 50, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00010311670232785956, |
|
"eval_loss": 4.732141971588135, |
|
"eval_runtime": 333.6338, |
|
"eval_samples_per_second": 48.958, |
|
"eval_steps_per_second": 12.241, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0010311670232785956, |
|
"grad_norm": 1.6720637083053589, |
|
"learning_rate": 0.0002, |
|
"loss": 2.4657, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.002062334046557191, |
|
"grad_norm": 1.9888806343078613, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 2.5398, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0030935010698357867, |
|
"grad_norm": 2.414046049118042, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 2.7624, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.004124668093114382, |
|
"grad_norm": 3.4234137535095215, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 2.4618, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005155835116392978, |
|
"grad_norm": 5.409121513366699, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 2.4923, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005155835116392978, |
|
"eval_loss": 2.6833808422088623, |
|
"eval_runtime": 333.2547, |
|
"eval_samples_per_second": 49.014, |
|
"eval_steps_per_second": 12.255, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.006187002139671573, |
|
"grad_norm": 0.715253472328186, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 1.8928, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.007218169162950169, |
|
"grad_norm": 1.2599879503250122, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 2.3518, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.008249336186228765, |
|
"grad_norm": 1.8510029315948486, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 2.6639, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.00928050320950736, |
|
"grad_norm": 2.6438262462615967, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 2.3797, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.010311670232785956, |
|
"grad_norm": 4.056297302246094, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 2.3671, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.010311670232785956, |
|
"eval_loss": 2.5445327758789062, |
|
"eval_runtime": 333.7914, |
|
"eval_samples_per_second": 48.935, |
|
"eval_steps_per_second": 12.235, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011342837256064551, |
|
"grad_norm": 0.6714215278625488, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 1.8972, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.012374004279343147, |
|
"grad_norm": 1.4429042339324951, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 2.2593, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.013405171302621742, |
|
"grad_norm": 1.767311930656433, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 2.7356, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.014436338325900338, |
|
"grad_norm": 2.493459701538086, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 2.4359, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.015467505349178933, |
|
"grad_norm": 3.981605291366577, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 2.2622, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015467505349178933, |
|
"eval_loss": 2.3954179286956787, |
|
"eval_runtime": 334.3325, |
|
"eval_samples_per_second": 48.856, |
|
"eval_steps_per_second": 12.215, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01649867237245753, |
|
"grad_norm": 0.597623348236084, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 1.8683, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.017529839395736123, |
|
"grad_norm": 1.2859327793121338, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 2.1469, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.01856100641901472, |
|
"grad_norm": 1.5065349340438843, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 2.5904, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.019592173442293314, |
|
"grad_norm": 2.6285014152526855, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 2.2524, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02062334046557191, |
|
"grad_norm": 4.162689685821533, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 2.2073, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02062334046557191, |
|
"eval_loss": 2.3460395336151123, |
|
"eval_runtime": 334.257, |
|
"eval_samples_per_second": 48.867, |
|
"eval_steps_per_second": 12.218, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.021654507488850505, |
|
"grad_norm": 0.6041058301925659, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 1.8224, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.022685674512129102, |
|
"grad_norm": 1.3182774782180786, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 2.2141, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.023716841535407696, |
|
"grad_norm": 1.494461178779602, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 2.7371, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.024748008558686294, |
|
"grad_norm": 2.414562225341797, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 2.2599, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.025779175581964887, |
|
"grad_norm": 3.806607484817505, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 2.1804, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.025779175581964887, |
|
"eval_loss": 2.274019956588745, |
|
"eval_runtime": 333.7217, |
|
"eval_samples_per_second": 48.945, |
|
"eval_steps_per_second": 12.238, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.026810342605243485, |
|
"grad_norm": 0.5745736360549927, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 1.7065, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02784150962852208, |
|
"grad_norm": 1.216685175895691, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 2.2073, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.028872676651800676, |
|
"grad_norm": 1.3834314346313477, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 2.6267, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02990384367507927, |
|
"grad_norm": 2.0821757316589355, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 2.2572, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.030935010698357867, |
|
"grad_norm": 4.278872489929199, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 2.1662, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.030935010698357867, |
|
"eval_loss": 2.2416651248931885, |
|
"eval_runtime": 333.1876, |
|
"eval_samples_per_second": 49.023, |
|
"eval_steps_per_second": 12.257, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.031966177721636464, |
|
"grad_norm": 0.5629070997238159, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 1.7367, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03299734474491506, |
|
"grad_norm": 1.3552706241607666, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 2.135, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03402851176819365, |
|
"grad_norm": 1.389815330505371, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 2.669, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.035059678791472246, |
|
"grad_norm": 2.1797492504119873, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 2.1926, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.036090845814750847, |
|
"grad_norm": 3.770401954650879, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 2.1735, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.036090845814750847, |
|
"eval_loss": 2.213304042816162, |
|
"eval_runtime": 333.8272, |
|
"eval_samples_per_second": 48.93, |
|
"eval_steps_per_second": 12.234, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.03712201283802944, |
|
"grad_norm": 0.6478493213653564, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 1.7416, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.038153179861308034, |
|
"grad_norm": 1.3208801746368408, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 2.0733, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03918434688458663, |
|
"grad_norm": 1.4734785556793213, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 2.6356, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04021551390786523, |
|
"grad_norm": 1.8664650917053223, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 2.2411, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.04124668093114382, |
|
"grad_norm": 3.517054796218872, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 2.0789, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04124668093114382, |
|
"eval_loss": 2.1788926124572754, |
|
"eval_runtime": 334.0731, |
|
"eval_samples_per_second": 48.893, |
|
"eval_steps_per_second": 12.225, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.042277847954422416, |
|
"grad_norm": 0.5625332593917847, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 1.7504, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.04330901497770101, |
|
"grad_norm": 1.5617921352386475, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 2.2538, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04434018200097961, |
|
"grad_norm": 1.5214532613754272, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 2.6096, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.045371349024258205, |
|
"grad_norm": 2.148418664932251, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 2.182, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.0464025160475368, |
|
"grad_norm": 3.7270758152008057, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 2.1063, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0464025160475368, |
|
"eval_loss": 2.1697134971618652, |
|
"eval_runtime": 333.9603, |
|
"eval_samples_per_second": 48.91, |
|
"eval_steps_per_second": 12.229, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04743368307081539, |
|
"grad_norm": 0.498226135969162, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 1.756, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04846485009409399, |
|
"grad_norm": 1.1780967712402344, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 2.0653, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.04949601711737259, |
|
"grad_norm": 1.453464150428772, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 2.6079, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.05052718414065118, |
|
"grad_norm": 2.0182747840881348, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 2.1106, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.051558351163929775, |
|
"grad_norm": 3.5075230598449707, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 2.0581, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.051558351163929775, |
|
"eval_loss": 2.1618905067443848, |
|
"eval_runtime": 333.0388, |
|
"eval_samples_per_second": 49.045, |
|
"eval_steps_per_second": 12.263, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.052589518187208376, |
|
"grad_norm": 0.5434964895248413, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 1.77, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.05362068521048697, |
|
"grad_norm": 1.2367784976959229, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 2.1081, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.05465185223376556, |
|
"grad_norm": 1.470454454421997, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 2.5899, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05568301925704416, |
|
"grad_norm": 2.281672477722168, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 2.1905, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05671418628032276, |
|
"grad_norm": 3.330380916595459, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 2.074, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.05671418628032276, |
|
"eval_loss": 2.1590285301208496, |
|
"eval_runtime": 333.5902, |
|
"eval_samples_per_second": 48.964, |
|
"eval_steps_per_second": 12.243, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.05774535330360135, |
|
"grad_norm": 0.5410750508308411, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 1.7281, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.058776520326879945, |
|
"grad_norm": 1.255186915397644, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 2.0614, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05980768735015854, |
|
"grad_norm": 1.453993558883667, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 2.5846, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.06083885437343714, |
|
"grad_norm": 1.7940559387207031, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 2.2761, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.061870021396715734, |
|
"grad_norm": 3.636902093887329, |
|
"learning_rate": 0.0, |
|
"loss": 2.0926, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.061870021396715734, |
|
"eval_loss": 2.1577975749969482, |
|
"eval_runtime": 333.6083, |
|
"eval_samples_per_second": 48.962, |
|
"eval_steps_per_second": 12.242, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2563752110627226e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|