lesso07's picture
Training in progress, step 500, checkpoint
86dffb8 verified
{
"best_metric": 1.2338447570800781,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.6901311249137336,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013802622498274672,
"eval_loss": 3.10593581199646,
"eval_runtime": 18.6566,
"eval_samples_per_second": 16.348,
"eval_steps_per_second": 4.127,
"step": 1
},
{
"epoch": 0.013802622498274672,
"grad_norm": 24.552059173583984,
"learning_rate": 4.1400000000000003e-05,
"loss": 5.1127,
"step": 10
},
{
"epoch": 0.027605244996549344,
"grad_norm": 15.504034042358398,
"learning_rate": 8.280000000000001e-05,
"loss": 3.841,
"step": 20
},
{
"epoch": 0.041407867494824016,
"grad_norm": 14.46058177947998,
"learning_rate": 0.00012419999999999998,
"loss": 3.534,
"step": 30
},
{
"epoch": 0.05521048999309869,
"grad_norm": 51.250701904296875,
"learning_rate": 0.00016560000000000001,
"loss": 3.9728,
"step": 40
},
{
"epoch": 0.06901311249137336,
"grad_norm": 27.90164566040039,
"learning_rate": 0.000207,
"loss": 3.3681,
"step": 50
},
{
"epoch": 0.06901311249137336,
"eval_loss": 2.3035168647766113,
"eval_runtime": 18.6377,
"eval_samples_per_second": 16.365,
"eval_steps_per_second": 4.131,
"step": 50
},
{
"epoch": 0.08281573498964803,
"grad_norm": 16.63316535949707,
"learning_rate": 0.00020674787920189178,
"loss": 4.045,
"step": 60
},
{
"epoch": 0.0966183574879227,
"grad_norm": 14.355842590332031,
"learning_rate": 0.00020599274511475253,
"loss": 3.2498,
"step": 70
},
{
"epoch": 0.11042097998619738,
"grad_norm": 14.458366394042969,
"learning_rate": 0.00020473827667594888,
"loss": 3.7108,
"step": 80
},
{
"epoch": 0.12422360248447205,
"grad_norm": 16.983057022094727,
"learning_rate": 0.00020299058552961598,
"loss": 3.4946,
"step": 90
},
{
"epoch": 0.13802622498274672,
"grad_norm": 30.41816520690918,
"learning_rate": 0.00020075818625134152,
"loss": 3.7154,
"step": 100
},
{
"epoch": 0.13802622498274672,
"eval_loss": 2.316256523132324,
"eval_runtime": 18.6116,
"eval_samples_per_second": 16.388,
"eval_steps_per_second": 4.137,
"step": 100
},
{
"epoch": 0.1518288474810214,
"grad_norm": 69.45185852050781,
"learning_rate": 0.00019805195486600916,
"loss": 3.5852,
"step": 110
},
{
"epoch": 0.16563146997929606,
"grad_norm": 14.340858459472656,
"learning_rate": 0.00019488507586089894,
"loss": 3.4744,
"step": 120
},
{
"epoch": 0.17943409247757075,
"grad_norm": 28.43404197692871,
"learning_rate": 0.00019127297795219008,
"loss": 3.1552,
"step": 130
},
{
"epoch": 0.1932367149758454,
"grad_norm": 18.572847366333008,
"learning_rate": 0.00018723325891780706,
"loss": 3.3675,
"step": 140
},
{
"epoch": 0.2070393374741201,
"grad_norm": 26.13022804260254,
"learning_rate": 0.0001827855998628142,
"loss": 3.4886,
"step": 150
},
{
"epoch": 0.2070393374741201,
"eval_loss": 1.9903616905212402,
"eval_runtime": 18.6032,
"eval_samples_per_second": 16.395,
"eval_steps_per_second": 4.139,
"step": 150
},
{
"epoch": 0.22084195997239475,
"grad_norm": 17.420490264892578,
"learning_rate": 0.0001779516693350504,
"loss": 3.5297,
"step": 160
},
{
"epoch": 0.23464458247066944,
"grad_norm": 21.1903133392334,
"learning_rate": 0.00017275501775814182,
"loss": 3.1483,
"step": 170
},
{
"epoch": 0.2484472049689441,
"grad_norm": 16.870092391967773,
"learning_rate": 0.00016722096269620562,
"loss": 2.9207,
"step": 180
},
{
"epoch": 0.26224982746721875,
"grad_norm": 19.400983810424805,
"learning_rate": 0.00016137646550922228,
"loss": 4.4511,
"step": 190
},
{
"epoch": 0.27605244996549344,
"grad_norm": 23.654308319091797,
"learning_rate": 0.00015525,
"loss": 2.9133,
"step": 200
},
{
"epoch": 0.27605244996549344,
"eval_loss": 1.8559632301330566,
"eval_runtime": 18.5949,
"eval_samples_per_second": 16.402,
"eval_steps_per_second": 4.141,
"step": 200
},
{
"epoch": 0.2898550724637681,
"grad_norm": 27.395263671875,
"learning_rate": 0.0001488714136926695,
"loss": 3.4052,
"step": 210
},
{
"epoch": 0.3036576949620428,
"grad_norm": 15.65333080291748,
"learning_rate": 0.0001422717824185469,
"loss": 3.1855,
"step": 220
},
{
"epoch": 0.31746031746031744,
"grad_norm": 13.883142471313477,
"learning_rate": 0.00013548325891780705,
"loss": 3.1015,
"step": 230
},
{
"epoch": 0.33126293995859213,
"grad_norm": 20.504344940185547,
"learning_rate": 0.0001285389161945656,
"loss": 3.6529,
"step": 240
},
{
"epoch": 0.3450655624568668,
"grad_norm": 37.432334899902344,
"learning_rate": 0.0001214725863885273,
"loss": 4.7279,
"step": 250
},
{
"epoch": 0.3450655624568668,
"eval_loss": 1.8165959119796753,
"eval_runtime": 18.6944,
"eval_samples_per_second": 16.315,
"eval_steps_per_second": 4.119,
"step": 250
},
{
"epoch": 0.3588681849551415,
"grad_norm": 16.661788940429688,
"learning_rate": 0.00011431869594820213,
"loss": 3.6444,
"step": 260
},
{
"epoch": 0.37267080745341613,
"grad_norm": 21.61427116394043,
"learning_rate": 0.00010711209790870886,
"loss": 2.8124,
"step": 270
},
{
"epoch": 0.3864734299516908,
"grad_norm": 14.587434768676758,
"learning_rate": 9.988790209129117e-05,
"loss": 2.6168,
"step": 280
},
{
"epoch": 0.4002760524499655,
"grad_norm": 10.792880058288574,
"learning_rate": 9.268130405179787e-05,
"loss": 2.8998,
"step": 290
},
{
"epoch": 0.4140786749482402,
"grad_norm": 19.90591049194336,
"learning_rate": 8.55274136114727e-05,
"loss": 2.9482,
"step": 300
},
{
"epoch": 0.4140786749482402,
"eval_loss": 1.5225303173065186,
"eval_runtime": 18.6815,
"eval_samples_per_second": 16.326,
"eval_steps_per_second": 4.122,
"step": 300
},
{
"epoch": 0.4278812974465148,
"grad_norm": 12.331941604614258,
"learning_rate": 7.84610838054344e-05,
"loss": 2.8716,
"step": 310
},
{
"epoch": 0.4416839199447895,
"grad_norm": 12.893694877624512,
"learning_rate": 7.151674108219295e-05,
"loss": 2.9602,
"step": 320
},
{
"epoch": 0.4554865424430642,
"grad_norm": 10.044930458068848,
"learning_rate": 6.472821758145309e-05,
"loss": 2.7228,
"step": 330
},
{
"epoch": 0.4692891649413389,
"grad_norm": 10.728809356689453,
"learning_rate": 5.8128586307330475e-05,
"loss": 2.9921,
"step": 340
},
{
"epoch": 0.4830917874396135,
"grad_norm": 20.39414405822754,
"learning_rate": 5.175000000000002e-05,
"loss": 2.815,
"step": 350
},
{
"epoch": 0.4830917874396135,
"eval_loss": 1.4471914768218994,
"eval_runtime": 18.6607,
"eval_samples_per_second": 16.344,
"eval_steps_per_second": 4.126,
"step": 350
},
{
"epoch": 0.4968944099378882,
"grad_norm": 11.737767219543457,
"learning_rate": 4.5623534490777714e-05,
"loss": 2.9584,
"step": 360
},
{
"epoch": 0.5106970324361628,
"grad_norm": 16.33255958557129,
"learning_rate": 3.9779037303794365e-05,
"loss": 2.6765,
"step": 370
},
{
"epoch": 0.5244996549344375,
"grad_norm": 10.07257080078125,
"learning_rate": 3.42449822418582e-05,
"loss": 2.715,
"step": 380
},
{
"epoch": 0.5383022774327122,
"grad_norm": 20.379079818725586,
"learning_rate": 2.9048330664949622e-05,
"loss": 2.5815,
"step": 390
},
{
"epoch": 0.5521048999309869,
"grad_norm": 24.735910415649414,
"learning_rate": 2.4214400137185785e-05,
"loss": 2.5287,
"step": 400
},
{
"epoch": 0.5521048999309869,
"eval_loss": 1.2898170948028564,
"eval_runtime": 18.6447,
"eval_samples_per_second": 16.359,
"eval_steps_per_second": 4.13,
"step": 400
},
{
"epoch": 0.5659075224292616,
"grad_norm": 10.461315155029297,
"learning_rate": 1.976674108219295e-05,
"loss": 2.38,
"step": 410
},
{
"epoch": 0.5797101449275363,
"grad_norm": 12.781827926635742,
"learning_rate": 1.572702204780991e-05,
"loss": 2.5618,
"step": 420
},
{
"epoch": 0.5935127674258109,
"grad_norm": 12.61215877532959,
"learning_rate": 1.2114924139101056e-05,
"loss": 2.1771,
"step": 430
},
{
"epoch": 0.6073153899240856,
"grad_norm": 12.710418701171875,
"learning_rate": 8.948045133990798e-06,
"loss": 2.2393,
"step": 440
},
{
"epoch": 0.6211180124223602,
"grad_norm": 20.297300338745117,
"learning_rate": 6.241813748658489e-06,
"loss": 2.4048,
"step": 450
},
{
"epoch": 0.6211180124223602,
"eval_loss": 1.241526484489441,
"eval_runtime": 18.7361,
"eval_samples_per_second": 16.279,
"eval_steps_per_second": 4.11,
"step": 450
},
{
"epoch": 0.6349206349206349,
"grad_norm": 13.001998901367188,
"learning_rate": 4.009414470383994e-06,
"loss": 2.4387,
"step": 460
},
{
"epoch": 0.6487232574189096,
"grad_norm": 10.605116844177246,
"learning_rate": 2.261723324051111e-06,
"loss": 2.2201,
"step": 470
},
{
"epoch": 0.6625258799171843,
"grad_norm": 8.887131690979004,
"learning_rate": 1.0072548852474675e-06,
"loss": 1.831,
"step": 480
},
{
"epoch": 0.6763285024154589,
"grad_norm": 15.026272773742676,
"learning_rate": 2.5212079810819554e-07,
"loss": 2.6445,
"step": 490
},
{
"epoch": 0.6901311249137336,
"grad_norm": 23.53835678100586,
"learning_rate": 0.0,
"loss": 2.4292,
"step": 500
},
{
"epoch": 0.6901311249137336,
"eval_loss": 1.2338447570800781,
"eval_runtime": 18.701,
"eval_samples_per_second": 16.309,
"eval_steps_per_second": 4.117,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.9437425893376e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}