Training in progress, step 500, checkpoint (commit e131cbc, verified)
{
"best_metric": 1.5858772993087769,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.01620929440941436,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.241858881882872e-05,
"eval_loss": 2.906552791595459,
"eval_runtime": 397.3517,
"eval_samples_per_second": 32.686,
"eval_steps_per_second": 8.172,
"step": 1
},
{
"epoch": 0.0003241858881882872,
"grad_norm": 6.263719081878662,
"learning_rate": 4.1400000000000003e-05,
"loss": 2.7332,
"step": 10
},
{
"epoch": 0.0006483717763765744,
"grad_norm": 5.530974388122559,
"learning_rate": 8.280000000000001e-05,
"loss": 1.9108,
"step": 20
},
{
"epoch": 0.0009725576645648615,
"grad_norm": 3.9441070556640625,
"learning_rate": 0.00012419999999999998,
"loss": 1.8745,
"step": 30
},
{
"epoch": 0.0012967435527531487,
"grad_norm": 4.260671138763428,
"learning_rate": 0.00016560000000000001,
"loss": 1.7333,
"step": 40
},
{
"epoch": 0.0016209294409414358,
"grad_norm": 4.642955303192139,
"learning_rate": 0.000207,
"loss": 1.6797,
"step": 50
},
{
"epoch": 0.0016209294409414358,
"eval_loss": 1.8519707918167114,
"eval_runtime": 398.2732,
"eval_samples_per_second": 32.611,
"eval_steps_per_second": 8.153,
"step": 50
},
{
"epoch": 0.001945115329129723,
"grad_norm": 4.400924205780029,
"learning_rate": 0.00020674787920189178,
"loss": 1.868,
"step": 60
},
{
"epoch": 0.00226930121731801,
"grad_norm": 5.454312324523926,
"learning_rate": 0.00020599274511475253,
"loss": 2.0069,
"step": 70
},
{
"epoch": 0.0025934871055062975,
"grad_norm": 4.467811107635498,
"learning_rate": 0.00020473827667594888,
"loss": 1.7519,
"step": 80
},
{
"epoch": 0.0029176729936945846,
"grad_norm": 4.802621841430664,
"learning_rate": 0.00020299058552961598,
"loss": 1.6975,
"step": 90
},
{
"epoch": 0.0032418588818828717,
"grad_norm": 4.396114826202393,
"learning_rate": 0.00020075818625134152,
"loss": 1.6838,
"step": 100
},
{
"epoch": 0.0032418588818828717,
"eval_loss": 1.8345727920532227,
"eval_runtime": 398.358,
"eval_samples_per_second": 32.604,
"eval_steps_per_second": 8.151,
"step": 100
},
{
"epoch": 0.0035660447700711587,
"grad_norm": 3.908578872680664,
"learning_rate": 0.00019805195486600916,
"loss": 2.0062,
"step": 110
},
{
"epoch": 0.003890230658259446,
"grad_norm": 3.713029623031616,
"learning_rate": 0.00019488507586089894,
"loss": 1.7925,
"step": 120
},
{
"epoch": 0.004214416546447733,
"grad_norm": 4.107682704925537,
"learning_rate": 0.00019127297795219008,
"loss": 1.8635,
"step": 130
},
{
"epoch": 0.00453860243463602,
"grad_norm": 5.115502834320068,
"learning_rate": 0.00018723325891780706,
"loss": 1.8397,
"step": 140
},
{
"epoch": 0.004862788322824307,
"grad_norm": 4.171757221221924,
"learning_rate": 0.0001827855998628142,
"loss": 1.8328,
"step": 150
},
{
"epoch": 0.004862788322824307,
"eval_loss": 1.7966933250427246,
"eval_runtime": 397.1601,
"eval_samples_per_second": 32.702,
"eval_steps_per_second": 8.176,
"step": 150
},
{
"epoch": 0.005186974211012595,
"grad_norm": 3.389646053314209,
"learning_rate": 0.0001779516693350504,
"loss": 1.8123,
"step": 160
},
{
"epoch": 0.005511160099200882,
"grad_norm": 4.33231258392334,
"learning_rate": 0.00017275501775814182,
"loss": 1.8088,
"step": 170
},
{
"epoch": 0.005835345987389169,
"grad_norm": 4.176562309265137,
"learning_rate": 0.00016722096269620562,
"loss": 1.7369,
"step": 180
},
{
"epoch": 0.006159531875577456,
"grad_norm": 3.4682700634002686,
"learning_rate": 0.00016137646550922228,
"loss": 1.6268,
"step": 190
},
{
"epoch": 0.006483717763765743,
"grad_norm": 4.2149200439453125,
"learning_rate": 0.00015525,
"loss": 1.6822,
"step": 200
},
{
"epoch": 0.006483717763765743,
"eval_loss": 1.7772059440612793,
"eval_runtime": 397.9979,
"eval_samples_per_second": 32.633,
"eval_steps_per_second": 8.158,
"step": 200
},
{
"epoch": 0.00680790365195403,
"grad_norm": 2.8696839809417725,
"learning_rate": 0.0001488714136926695,
"loss": 1.8906,
"step": 210
},
{
"epoch": 0.0071320895401423175,
"grad_norm": 3.76540207862854,
"learning_rate": 0.0001422717824185469,
"loss": 1.7974,
"step": 220
},
{
"epoch": 0.0074562754283306045,
"grad_norm": 4.557528972625732,
"learning_rate": 0.00013548325891780705,
"loss": 1.8021,
"step": 230
},
{
"epoch": 0.007780461316518892,
"grad_norm": 4.325812339782715,
"learning_rate": 0.0001285389161945656,
"loss": 1.7236,
"step": 240
},
{
"epoch": 0.00810464720470718,
"grad_norm": 3.879993200302124,
"learning_rate": 0.0001214725863885273,
"loss": 1.7255,
"step": 250
},
{
"epoch": 0.00810464720470718,
"eval_loss": 1.7434029579162598,
"eval_runtime": 397.2014,
"eval_samples_per_second": 32.699,
"eval_steps_per_second": 8.175,
"step": 250
},
{
"epoch": 0.008428833092895466,
"grad_norm": 3.901002883911133,
"learning_rate": 0.00011431869594820213,
"loss": 1.7344,
"step": 260
},
{
"epoch": 0.008753018981083754,
"grad_norm": 4.044349670410156,
"learning_rate": 0.00010711209790870886,
"loss": 1.576,
"step": 270
},
{
"epoch": 0.00907720486927204,
"grad_norm": 3.8595011234283447,
"learning_rate": 9.988790209129117e-05,
"loss": 1.802,
"step": 280
},
{
"epoch": 0.009401390757460328,
"grad_norm": 3.7126569747924805,
"learning_rate": 9.268130405179787e-05,
"loss": 1.6513,
"step": 290
},
{
"epoch": 0.009725576645648614,
"grad_norm": 4.118965148925781,
"learning_rate": 8.55274136114727e-05,
"loss": 1.5466,
"step": 300
},
{
"epoch": 0.009725576645648614,
"eval_loss": 1.6769126653671265,
"eval_runtime": 399.0787,
"eval_samples_per_second": 32.545,
"eval_steps_per_second": 8.136,
"step": 300
},
{
"epoch": 0.010049762533836902,
"grad_norm": 3.541172981262207,
"learning_rate": 7.84610838054344e-05,
"loss": 1.8639,
"step": 310
},
{
"epoch": 0.01037394842202519,
"grad_norm": 4.0853753089904785,
"learning_rate": 7.151674108219295e-05,
"loss": 1.6957,
"step": 320
},
{
"epoch": 0.010698134310213476,
"grad_norm": 3.440962076187134,
"learning_rate": 6.472821758145309e-05,
"loss": 1.6691,
"step": 330
},
{
"epoch": 0.011022320198401764,
"grad_norm": 3.6532485485076904,
"learning_rate": 5.8128586307330475e-05,
"loss": 1.5867,
"step": 340
},
{
"epoch": 0.01134650608659005,
"grad_norm": 3.979214906692505,
"learning_rate": 5.175000000000002e-05,
"loss": 1.5499,
"step": 350
},
{
"epoch": 0.01134650608659005,
"eval_loss": 1.6310955286026,
"eval_runtime": 398.0556,
"eval_samples_per_second": 32.629,
"eval_steps_per_second": 8.157,
"step": 350
},
{
"epoch": 0.011670691974778338,
"grad_norm": 3.600574254989624,
"learning_rate": 4.5623534490777714e-05,
"loss": 1.8409,
"step": 360
},
{
"epoch": 0.011994877862966625,
"grad_norm": 3.5220589637756348,
"learning_rate": 3.9779037303794365e-05,
"loss": 1.5889,
"step": 370
},
{
"epoch": 0.012319063751154912,
"grad_norm": 3.7355663776397705,
"learning_rate": 3.42449822418582e-05,
"loss": 1.5124,
"step": 380
},
{
"epoch": 0.012643249639343199,
"grad_norm": 3.8457112312316895,
"learning_rate": 2.9048330664949622e-05,
"loss": 1.5491,
"step": 390
},
{
"epoch": 0.012967435527531487,
"grad_norm": 3.4412333965301514,
"learning_rate": 2.4214400137185785e-05,
"loss": 1.549,
"step": 400
},
{
"epoch": 0.012967435527531487,
"eval_loss": 1.606103539466858,
"eval_runtime": 396.0436,
"eval_samples_per_second": 32.794,
"eval_steps_per_second": 8.199,
"step": 400
},
{
"epoch": 0.013291621415719775,
"grad_norm": 3.6701438426971436,
"learning_rate": 1.976674108219295e-05,
"loss": 1.6744,
"step": 410
},
{
"epoch": 0.01361580730390806,
"grad_norm": 4.545055389404297,
"learning_rate": 1.572702204780991e-05,
"loss": 1.6089,
"step": 420
},
{
"epoch": 0.013939993192096349,
"grad_norm": 3.4341204166412354,
"learning_rate": 1.2114924139101056e-05,
"loss": 1.5792,
"step": 430
},
{
"epoch": 0.014264179080284635,
"grad_norm": 3.332812547683716,
"learning_rate": 8.948045133990798e-06,
"loss": 1.5303,
"step": 440
},
{
"epoch": 0.014588364968472923,
"grad_norm": 3.597540855407715,
"learning_rate": 6.241813748658489e-06,
"loss": 1.5332,
"step": 450
},
{
"epoch": 0.014588364968472923,
"eval_loss": 1.5877035856246948,
"eval_runtime": 397.7506,
"eval_samples_per_second": 32.654,
"eval_steps_per_second": 8.163,
"step": 450
},
{
"epoch": 0.014912550856661209,
"grad_norm": 3.2884576320648193,
"learning_rate": 4.009414470383994e-06,
"loss": 1.8572,
"step": 460
},
{
"epoch": 0.015236736744849497,
"grad_norm": 3.6394126415252686,
"learning_rate": 2.261723324051111e-06,
"loss": 1.7304,
"step": 470
},
{
"epoch": 0.015560922633037783,
"grad_norm": 2.9843626022338867,
"learning_rate": 1.0072548852474675e-06,
"loss": 1.631,
"step": 480
},
{
"epoch": 0.01588510852122607,
"grad_norm": 3.780142068862915,
"learning_rate": 2.5212079810819554e-07,
"loss": 1.2314,
"step": 490
},
{
"epoch": 0.01620929440941436,
"grad_norm": 3.420269012451172,
"learning_rate": 0.0,
"loss": 1.4883,
"step": 500
},
{
"epoch": 0.01620929440941436,
"eval_loss": 1.5858772993087769,
"eval_runtime": 398.3709,
"eval_samples_per_second": 32.603,
"eval_steps_per_second": 8.151,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.789971686378701e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
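
For reference, a short Python sketch (not part of the checkpoint itself) showing one way to read this trainer_state.json and sanity-check the logged learning-rate values. The file path, the 2.07e-4 peak rate, the 50 warmup steps, and the cosine shape are assumptions inferred from the numbers in log_history above, not fields recorded in the file.

```python
import json
import math

# Minimal sketch for inspecting this trainer_state.json. The path below is an
# assumption; point it at the actual checkpoint directory, e.g.
# miner_id_24/checkpoint-500/trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two kinds of entries: training logs (every 10 steps, with
# "loss", "grad_norm", "learning_rate") and evaluation logs (every 50 steps,
# with "eval_loss" and timing fields).
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
for e in eval_log:
    print(f"step {e['step']:>3}: eval_loss {e['eval_loss']:.4f}")

# The logged learning rates look like linear warmup over the first 50 steps to
# a peak of 2.07e-4, followed by cosine decay to 0 at step 500. This schedule
# is inferred from the logged values, not stored anywhere in the file.
def inferred_lr(step, peak=2.07e-4, warmup=50, total=500):
    if step <= warmup:
        return peak * step / warmup
    progress = (step - warmup) / (total - warmup)
    return peak * 0.5 * (1.0 + math.cos(math.pi * progress))

worst = max(abs(e["learning_rate"] - inferred_lr(e["step"])) for e in train_log)
print(f"largest gap between logged and inferred learning rate: {worst:.3e}")
```

Read this way, the log shows eval_loss falling from 2.9066 at step 1 to 1.5859 at step 500, improving at every 50-step evaluation, which is consistent with early_stopping_patience_counter remaining at 0 in the EarlyStoppingCallback state.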