|
{ |
|
"best_metric": 0.579126238822937, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-600", |
|
"epoch": 0.15643332029722332, |
|
"eval_steps": 50, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00026072220049537216, |
|
"eval_loss": 2.0383806228637695, |
|
"eval_runtime": 508.8695, |
|
"eval_samples_per_second": 12.695, |
|
"eval_steps_per_second": 3.174, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0026072220049537216, |
|
"grad_norm": 14.43869400024414, |
|
"learning_rate": 0.0002, |
|
"loss": 4.3187, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005214444009907443, |
|
"grad_norm": 16.409202575683594, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 3.3223, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007821666014861166, |
|
"grad_norm": 17.23414421081543, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 3.5692, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010428888019814887, |
|
"grad_norm": 21.548076629638672, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 3.5142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013036110024768609, |
|
"grad_norm": 25.257431030273438, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 4.3892, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.013036110024768609, |
|
"eval_loss": 0.8825841546058655, |
|
"eval_runtime": 512.9624, |
|
"eval_samples_per_second": 12.594, |
|
"eval_steps_per_second": 3.148, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01564333202972233, |
|
"grad_norm": 13.57625961303711, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 3.0624, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.018250554034676052, |
|
"grad_norm": 13.32381820678711, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 2.9754, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.020857776039629773, |
|
"grad_norm": 14.278241157531738, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 3.3012, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.023464998044583497, |
|
"grad_norm": 50.32378387451172, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 3.306, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.026072220049537218, |
|
"grad_norm": 20.243038177490234, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 3.9649, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.026072220049537218, |
|
"eval_loss": 0.7839609384536743, |
|
"eval_runtime": 513.4664, |
|
"eval_samples_per_second": 12.581, |
|
"eval_steps_per_second": 3.145, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02867944205449094, |
|
"grad_norm": 13.500850677490234, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 2.5316, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03128666405944466, |
|
"grad_norm": 8.94965934753418, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 2.7797, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03389388606439838, |
|
"grad_norm": 15.162632942199707, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 3.0097, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.036501108069352105, |
|
"grad_norm": 15.734134674072266, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 2.8637, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03910833007430583, |
|
"grad_norm": 24.522480010986328, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 3.661, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03910833007430583, |
|
"eval_loss": 0.770531177520752, |
|
"eval_runtime": 513.902, |
|
"eval_samples_per_second": 12.57, |
|
"eval_steps_per_second": 3.143, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.041715552079259546, |
|
"grad_norm": 10.44023609161377, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 2.5952, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04432277408421327, |
|
"grad_norm": 13.444787979125977, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 2.6771, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.046929996089166995, |
|
"grad_norm": 15.03112506866455, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 3.0433, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04953721809412071, |
|
"grad_norm": 13.681419372558594, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 3.3614, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.052144440099074436, |
|
"grad_norm": 25.215314865112305, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 3.507, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.052144440099074436, |
|
"eval_loss": 0.7421609163284302, |
|
"eval_runtime": 513.024, |
|
"eval_samples_per_second": 12.592, |
|
"eval_steps_per_second": 3.148, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05475166210402816, |
|
"grad_norm": 9.96532917022705, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 2.2643, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05735888410898188, |
|
"grad_norm": 12.444780349731445, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 2.6161, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0599661061139356, |
|
"grad_norm": 13.275497436523438, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 2.9035, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06257332811888933, |
|
"grad_norm": 22.568071365356445, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 3.1491, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06518055012384305, |
|
"grad_norm": 18.894039154052734, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 3.6063, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06518055012384305, |
|
"eval_loss": 0.7025501132011414, |
|
"eval_runtime": 512.1493, |
|
"eval_samples_per_second": 12.614, |
|
"eval_steps_per_second": 3.153, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06778777212879676, |
|
"grad_norm": 9.569221496582031, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 2.3234, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07039499413375049, |
|
"grad_norm": 12.406023979187012, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 2.2466, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07300221613870421, |
|
"grad_norm": 14.062081336975098, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 2.8102, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07560943814365793, |
|
"grad_norm": 12.484160423278809, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 2.9179, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07821666014861166, |
|
"grad_norm": 19.24785804748535, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 3.4329, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07821666014861166, |
|
"eval_loss": 0.6793892979621887, |
|
"eval_runtime": 512.6051, |
|
"eval_samples_per_second": 12.602, |
|
"eval_steps_per_second": 3.151, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08082388215356538, |
|
"grad_norm": 7.278568744659424, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 2.2598, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08343110415851909, |
|
"grad_norm": 11.45936393737793, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 2.221, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08603832616347282, |
|
"grad_norm": 11.150221824645996, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 2.7018, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08864554816842654, |
|
"grad_norm": 14.54114055633545, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 2.7212, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09125277017338027, |
|
"grad_norm": 39.82421112060547, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 3.3754, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09125277017338027, |
|
"eval_loss": 0.6544287800788879, |
|
"eval_runtime": 512.4276, |
|
"eval_samples_per_second": 12.607, |
|
"eval_steps_per_second": 3.152, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09385999217833399, |
|
"grad_norm": 10.35319709777832, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 2.0052, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0964672141832877, |
|
"grad_norm": 11.164114952087402, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 2.3054, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09907443618824142, |
|
"grad_norm": 10.16714859008789, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 2.4965, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.10168165819319515, |
|
"grad_norm": 10.978752136230469, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 2.8824, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10428888019814887, |
|
"grad_norm": 19.6451473236084, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 3.0805, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10428888019814887, |
|
"eval_loss": 0.6194283962249756, |
|
"eval_runtime": 512.9361, |
|
"eval_samples_per_second": 12.594, |
|
"eval_steps_per_second": 3.149, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1068961022031026, |
|
"grad_norm": 8.461108207702637, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 1.9141, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10950332420805632, |
|
"grad_norm": 10.8994140625, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 2.2111, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11211054621301003, |
|
"grad_norm": 9.634496688842773, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 2.1574, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11471776821796376, |
|
"grad_norm": 12.223336219787598, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 2.7663, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11732499022291748, |
|
"grad_norm": 20.796768188476562, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 3.095, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11732499022291748, |
|
"eval_loss": 0.598429799079895, |
|
"eval_runtime": 512.9195, |
|
"eval_samples_per_second": 12.595, |
|
"eval_steps_per_second": 3.149, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1199322122278712, |
|
"grad_norm": 7.045400619506836, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 1.9138, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.12253943423282493, |
|
"grad_norm": 8.932710647583008, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 2.1108, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12514665623777865, |
|
"grad_norm": 14.591238975524902, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 2.394, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12775387824273238, |
|
"grad_norm": 11.373315811157227, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 2.4359, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1303611002476861, |
|
"grad_norm": 18.106515884399414, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 2.9492, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1303611002476861, |
|
"eval_loss": 0.5898227095603943, |
|
"eval_runtime": 514.4469, |
|
"eval_samples_per_second": 12.557, |
|
"eval_steps_per_second": 3.139, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13296832225263983, |
|
"grad_norm": 9.076396942138672, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 2.0251, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13557554425759352, |
|
"grad_norm": 11.412988662719727, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 2.2448, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.13818276626254725, |
|
"grad_norm": 10.8418550491333, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 2.3559, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.14078998826750097, |
|
"grad_norm": 9.6925687789917, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 2.4988, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1433972102724547, |
|
"grad_norm": 18.714468002319336, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 2.8972, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1433972102724547, |
|
"eval_loss": 0.5796568989753723, |
|
"eval_runtime": 514.4013, |
|
"eval_samples_per_second": 12.558, |
|
"eval_steps_per_second": 3.14, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14600443227740842, |
|
"grad_norm": 8.010424613952637, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 1.7564, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14861165428236214, |
|
"grad_norm": 8.445106506347656, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 2.0401, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.15121887628731587, |
|
"grad_norm": 10.118414878845215, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 2.3152, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1538260982922696, |
|
"grad_norm": 11.436360359191895, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 2.4145, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15643332029722332, |
|
"grad_norm": 14.37455940246582, |
|
"learning_rate": 0.0, |
|
"loss": 2.9678, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15643332029722332, |
|
"eval_loss": 0.579126238822937, |
|
"eval_runtime": 512.7083, |
|
"eval_samples_per_second": 12.6, |
|
"eval_steps_per_second": 3.15, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.982119101916774e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|