|
{ |
|
"best_metric": 0.630431056022644, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.021073483236044084, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 4.214696647208817e-05, |
|
"eval_loss": 1.147399663925171, |
|
"eval_runtime": 871.891, |
|
"eval_samples_per_second": 11.458, |
|
"eval_steps_per_second": 2.865, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0004214696647208817, |
|
"grad_norm": 0.7626272439956665, |
|
"learning_rate": 4.34e-05, |
|
"loss": 1.0628, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0008429393294417634, |
|
"grad_norm": 0.4994406998157501, |
|
"learning_rate": 8.68e-05, |
|
"loss": 0.856, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.001264408994162645, |
|
"grad_norm": 0.5089827179908752, |
|
"learning_rate": 0.0001302, |
|
"loss": 0.7376, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0016858786588835268, |
|
"grad_norm": 0.6126852035522461, |
|
"learning_rate": 0.0001736, |
|
"loss": 0.7691, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0021073483236044087, |
|
"grad_norm": 1.0332469940185547, |
|
"learning_rate": 0.000217, |
|
"loss": 0.7831, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0021073483236044087, |
|
"eval_loss": 0.7892972230911255, |
|
"eval_runtime": 872.9595, |
|
"eval_samples_per_second": 11.444, |
|
"eval_steps_per_second": 2.862, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00252881798832529, |
|
"grad_norm": 0.43384820222854614, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 0.7715, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.002950287653046172, |
|
"grad_norm": 0.4282907545566559, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 0.6761, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0033717573177670535, |
|
"grad_norm": 0.4396986663341522, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 0.6714, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0037932269824879354, |
|
"grad_norm": 0.5730768442153931, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 0.7565, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004214696647208817, |
|
"grad_norm": 0.9540401697158813, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 0.7393, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004214696647208817, |
|
"eval_loss": 0.7602397203445435, |
|
"eval_runtime": 872.7942, |
|
"eval_samples_per_second": 11.446, |
|
"eval_steps_per_second": 2.862, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004636166311929699, |
|
"grad_norm": 0.3995174765586853, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 0.7404, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.00505763597665058, |
|
"grad_norm": 0.4123441278934479, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 0.6722, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005479105641371463, |
|
"grad_norm": 0.4265460968017578, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 0.6447, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.005900575306092344, |
|
"grad_norm": 0.5936018228530884, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 0.7289, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006322044970813226, |
|
"grad_norm": 1.0116862058639526, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 0.7043, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.006322044970813226, |
|
"eval_loss": 0.7188068628311157, |
|
"eval_runtime": 872.6433, |
|
"eval_samples_per_second": 11.448, |
|
"eval_steps_per_second": 2.863, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.006743514635534107, |
|
"grad_norm": 0.4303455054759979, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 0.7164, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.007164984300254989, |
|
"grad_norm": 0.42676424980163574, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 0.6661, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.007586453964975871, |
|
"grad_norm": 0.44973692297935486, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 0.6661, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.008007923629696753, |
|
"grad_norm": 0.48232847452163696, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 0.6969, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.008429393294417635, |
|
"grad_norm": 0.9322578310966492, |
|
"learning_rate": 0.00016275, |
|
"loss": 0.6935, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.008429393294417635, |
|
"eval_loss": 0.7024884819984436, |
|
"eval_runtime": 872.9871, |
|
"eval_samples_per_second": 11.443, |
|
"eval_steps_per_second": 2.861, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.008850862959138516, |
|
"grad_norm": 0.42416146397590637, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 0.6823, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.009272332623859398, |
|
"grad_norm": 0.394248902797699, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 0.6591, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.009693802288580279, |
|
"grad_norm": 0.40907424688339233, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 0.6353, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.01011527195330116, |
|
"grad_norm": 0.49951237440109253, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 0.6632, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.010536741618022042, |
|
"grad_norm": 1.0042486190795898, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 0.6702, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.010536741618022042, |
|
"eval_loss": 0.6787152290344238, |
|
"eval_runtime": 872.8552, |
|
"eval_samples_per_second": 11.445, |
|
"eval_steps_per_second": 2.862, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.010958211282742925, |
|
"grad_norm": 0.4140114188194275, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 0.6869, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.011379680947463807, |
|
"grad_norm": 0.3980177640914917, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 0.618, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.011801150612184688, |
|
"grad_norm": 0.4262223243713379, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 0.6145, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.01222262027690557, |
|
"grad_norm": 0.5590672492980957, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 0.6731, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.012644089941626451, |
|
"grad_norm": 0.9287014603614807, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 0.6508, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.012644089941626451, |
|
"eval_loss": 0.6602598428726196, |
|
"eval_runtime": 873.3259, |
|
"eval_samples_per_second": 11.439, |
|
"eval_steps_per_second": 2.86, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013065559606347333, |
|
"grad_norm": 0.41679131984710693, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 0.658, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.013487029271068214, |
|
"grad_norm": 0.4045983552932739, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 0.6414, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.013908498935789097, |
|
"grad_norm": 0.4106775224208832, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 0.6159, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.014329968600509979, |
|
"grad_norm": 0.5271579623222351, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 0.6481, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01475143826523086, |
|
"grad_norm": 0.8985204100608826, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 0.6159, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01475143826523086, |
|
"eval_loss": 0.6442619562149048, |
|
"eval_runtime": 873.0849, |
|
"eval_samples_per_second": 11.442, |
|
"eval_steps_per_second": 2.861, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.015172907929951742, |
|
"grad_norm": 0.41902175545692444, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 0.6228, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.015594377594672623, |
|
"grad_norm": 0.3683434724807739, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 0.6158, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.016015847259393506, |
|
"grad_norm": 0.4270426630973816, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 0.6093, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.016437316924114386, |
|
"grad_norm": 0.5241418480873108, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 0.662, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.01685878658883527, |
|
"grad_norm": 0.7835712432861328, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 0.6354, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01685878658883527, |
|
"eval_loss": 0.635778546333313, |
|
"eval_runtime": 873.4484, |
|
"eval_samples_per_second": 11.437, |
|
"eval_steps_per_second": 2.86, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01728025625355615, |
|
"grad_norm": 0.4076845645904541, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 0.6454, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.017701725918277032, |
|
"grad_norm": 0.4060816466808319, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 0.6029, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.018123195582997912, |
|
"grad_norm": 0.43102189898490906, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 0.5974, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.018544665247718795, |
|
"grad_norm": 0.515192985534668, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 0.6454, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01896613491243968, |
|
"grad_norm": 0.9731060266494751, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 0.6225, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.01896613491243968, |
|
"eval_loss": 0.6311845779418945, |
|
"eval_runtime": 873.5871, |
|
"eval_samples_per_second": 11.436, |
|
"eval_steps_per_second": 2.859, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.019387604577160558, |
|
"grad_norm": 0.38442152738571167, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 0.6438, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.01980907424188144, |
|
"grad_norm": 0.38169050216674805, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 0.6287, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.02023054390660232, |
|
"grad_norm": 0.4052588641643524, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 0.6131, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.020652013571323204, |
|
"grad_norm": 0.47564056515693665, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 0.6782, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.021073483236044084, |
|
"grad_norm": 0.9834715127944946, |
|
"learning_rate": 0.0, |
|
"loss": 0.6781, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.021073483236044084, |
|
"eval_loss": 0.630431056022644, |
|
"eval_runtime": 874.2378, |
|
"eval_samples_per_second": 11.427, |
|
"eval_steps_per_second": 2.857, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3554433904345088e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|