lesso07's picture
Training in progress, step 400, checkpoint
af78eb4 verified
raw
history blame
9.94 kB
{
"best_metric": 2.037623643875122,
"best_model_checkpoint": "miner_id_24/checkpoint-400",
"epoch": 0.028237619568670363,
"eval_steps": 50,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.05940489216759e-05,
"eval_loss": 2.8033833503723145,
"eval_runtime": 108.3643,
"eval_samples_per_second": 55.046,
"eval_steps_per_second": 13.768,
"step": 1
},
{
"epoch": 0.0007059404892167591,
"grad_norm": 1.7475390434265137,
"learning_rate": 4.1400000000000003e-05,
"loss": 2.648,
"step": 10
},
{
"epoch": 0.0014118809784335181,
"grad_norm": 1.68655526638031,
"learning_rate": 8.280000000000001e-05,
"loss": 2.4858,
"step": 20
},
{
"epoch": 0.002117821467650277,
"grad_norm": 1.582150936126709,
"learning_rate": 0.00012419999999999998,
"loss": 2.4498,
"step": 30
},
{
"epoch": 0.0028237619568670363,
"grad_norm": 1.5243428945541382,
"learning_rate": 0.00016560000000000001,
"loss": 2.4085,
"step": 40
},
{
"epoch": 0.0035297024460837953,
"grad_norm": 1.5775226354599,
"learning_rate": 0.000207,
"loss": 2.3538,
"step": 50
},
{
"epoch": 0.0035297024460837953,
"eval_loss": 2.3329827785491943,
"eval_runtime": 108.5486,
"eval_samples_per_second": 54.952,
"eval_steps_per_second": 13.745,
"step": 50
},
{
"epoch": 0.004235642935300554,
"grad_norm": 1.0230377912521362,
"learning_rate": 0.00020674787920189178,
"loss": 2.4597,
"step": 60
},
{
"epoch": 0.004941583424517313,
"grad_norm": 1.1582502126693726,
"learning_rate": 0.00020599274511475253,
"loss": 2.2224,
"step": 70
},
{
"epoch": 0.0056475239137340726,
"grad_norm": 1.2100154161453247,
"learning_rate": 0.00020473827667594888,
"loss": 2.1416,
"step": 80
},
{
"epoch": 0.006353464402950831,
"grad_norm": 1.165224552154541,
"learning_rate": 0.00020299058552961598,
"loss": 2.2391,
"step": 90
},
{
"epoch": 0.007059404892167591,
"grad_norm": 1.492423176765442,
"learning_rate": 0.00020075818625134152,
"loss": 2.308,
"step": 100
},
{
"epoch": 0.007059404892167591,
"eval_loss": 2.240849018096924,
"eval_runtime": 108.306,
"eval_samples_per_second": 55.075,
"eval_steps_per_second": 13.776,
"step": 100
},
{
"epoch": 0.007765345381384349,
"grad_norm": 0.9571946263313293,
"learning_rate": 0.00019805195486600916,
"loss": 2.3412,
"step": 110
},
{
"epoch": 0.008471285870601109,
"grad_norm": 1.0861552953720093,
"learning_rate": 0.00019488507586089894,
"loss": 2.1932,
"step": 120
},
{
"epoch": 0.009177226359817867,
"grad_norm": 1.318565011024475,
"learning_rate": 0.00019127297795219008,
"loss": 2.199,
"step": 130
},
{
"epoch": 0.009883166849034626,
"grad_norm": 1.283436894416809,
"learning_rate": 0.00018723325891780706,
"loss": 2.2015,
"step": 140
},
{
"epoch": 0.010589107338251386,
"grad_norm": 1.3785208463668823,
"learning_rate": 0.0001827855998628142,
"loss": 2.1604,
"step": 150
},
{
"epoch": 0.010589107338251386,
"eval_loss": 2.1907973289489746,
"eval_runtime": 108.2635,
"eval_samples_per_second": 55.097,
"eval_steps_per_second": 13.781,
"step": 150
},
{
"epoch": 0.011295047827468145,
"grad_norm": 0.9574385285377502,
"learning_rate": 0.0001779516693350504,
"loss": 2.1949,
"step": 160
},
{
"epoch": 0.012000988316684903,
"grad_norm": 1.022146463394165,
"learning_rate": 0.00017275501775814182,
"loss": 2.0991,
"step": 170
},
{
"epoch": 0.012706928805901662,
"grad_norm": 1.1318084001541138,
"learning_rate": 0.00016722096269620562,
"loss": 2.1822,
"step": 180
},
{
"epoch": 0.013412869295118422,
"grad_norm": 1.1545848846435547,
"learning_rate": 0.00016137646550922228,
"loss": 2.1998,
"step": 190
},
{
"epoch": 0.014118809784335181,
"grad_norm": 1.444872260093689,
"learning_rate": 0.00015525,
"loss": 2.163,
"step": 200
},
{
"epoch": 0.014118809784335181,
"eval_loss": 2.1361401081085205,
"eval_runtime": 115.7055,
"eval_samples_per_second": 51.553,
"eval_steps_per_second": 12.895,
"step": 200
},
{
"epoch": 0.01482475027355194,
"grad_norm": 0.986111581325531,
"learning_rate": 0.0001488714136926695,
"loss": 2.2117,
"step": 210
},
{
"epoch": 0.015530690762768699,
"grad_norm": 1.0047094821929932,
"learning_rate": 0.0001422717824185469,
"loss": 2.1779,
"step": 220
},
{
"epoch": 0.016236631251985458,
"grad_norm": 1.1812105178833008,
"learning_rate": 0.00013548325891780705,
"loss": 2.1505,
"step": 230
},
{
"epoch": 0.016942571741202218,
"grad_norm": 1.5376032590866089,
"learning_rate": 0.0001285389161945656,
"loss": 2.0143,
"step": 240
},
{
"epoch": 0.017648512230418977,
"grad_norm": 1.3409324884414673,
"learning_rate": 0.0001214725863885273,
"loss": 2.1464,
"step": 250
},
{
"epoch": 0.017648512230418977,
"eval_loss": 2.1107239723205566,
"eval_runtime": 108.6879,
"eval_samples_per_second": 54.882,
"eval_steps_per_second": 13.727,
"step": 250
},
{
"epoch": 0.018354452719635733,
"grad_norm": 0.8746324777603149,
"learning_rate": 0.00011431869594820213,
"loss": 2.2634,
"step": 260
},
{
"epoch": 0.019060393208852493,
"grad_norm": 1.0597100257873535,
"learning_rate": 0.00010711209790870886,
"loss": 2.1859,
"step": 270
},
{
"epoch": 0.019766333698069252,
"grad_norm": 0.9971674084663391,
"learning_rate": 9.988790209129117e-05,
"loss": 2.0446,
"step": 280
},
{
"epoch": 0.02047227418728601,
"grad_norm": 1.1347280740737915,
"learning_rate": 9.268130405179787e-05,
"loss": 2.0384,
"step": 290
},
{
"epoch": 0.02117821467650277,
"grad_norm": 1.377494215965271,
"learning_rate": 8.55274136114727e-05,
"loss": 2.0872,
"step": 300
},
{
"epoch": 0.02117821467650277,
"eval_loss": 2.081465721130371,
"eval_runtime": 115.6558,
"eval_samples_per_second": 51.575,
"eval_steps_per_second": 12.9,
"step": 300
},
{
"epoch": 0.02188415516571953,
"grad_norm": 0.9409071207046509,
"learning_rate": 7.84610838054344e-05,
"loss": 2.1326,
"step": 310
},
{
"epoch": 0.02259009565493629,
"grad_norm": 0.9766196608543396,
"learning_rate": 7.151674108219295e-05,
"loss": 2.1009,
"step": 320
},
{
"epoch": 0.023296036144153046,
"grad_norm": 1.045487880706787,
"learning_rate": 6.472821758145309e-05,
"loss": 1.9352,
"step": 330
},
{
"epoch": 0.024001976633369806,
"grad_norm": 1.2392162084579468,
"learning_rate": 5.8128586307330475e-05,
"loss": 2.0905,
"step": 340
},
{
"epoch": 0.024707917122586565,
"grad_norm": 1.3943520784378052,
"learning_rate": 5.175000000000002e-05,
"loss": 2.1226,
"step": 350
},
{
"epoch": 0.024707917122586565,
"eval_loss": 2.057299852371216,
"eval_runtime": 109.4826,
"eval_samples_per_second": 54.484,
"eval_steps_per_second": 13.628,
"step": 350
},
{
"epoch": 0.025413857611803325,
"grad_norm": 0.9228034615516663,
"learning_rate": 4.5623534490777714e-05,
"loss": 2.1183,
"step": 360
},
{
"epoch": 0.026119798101020084,
"grad_norm": 0.9374071955680847,
"learning_rate": 3.9779037303794365e-05,
"loss": 2.0988,
"step": 370
},
{
"epoch": 0.026825738590236844,
"grad_norm": 1.051098108291626,
"learning_rate": 3.42449822418582e-05,
"loss": 1.9813,
"step": 380
},
{
"epoch": 0.027531679079453603,
"grad_norm": 1.083287000656128,
"learning_rate": 2.9048330664949622e-05,
"loss": 2.1039,
"step": 390
},
{
"epoch": 0.028237619568670363,
"grad_norm": 1.2849839925765991,
"learning_rate": 2.4214400137185785e-05,
"loss": 1.9843,
"step": 400
},
{
"epoch": 0.028237619568670363,
"eval_loss": 2.037623643875122,
"eval_runtime": 107.9784,
"eval_samples_per_second": 55.243,
"eval_steps_per_second": 13.818,
"step": 400
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1105701006934016e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}