MechBERT-uncased / trainer_state.json
Public Release · 49d3fa2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 78.07620237351655,
"global_step": 125000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.62,
"learning_rate": 7.832e-06,
"loss": 2.2981,
"step": 1000
},
{
"epoch": 1.25,
"learning_rate": 1.5832e-05,
"loss": 1.7902,
"step": 2000
},
{
"epoch": 1.87,
"learning_rate": 2.3832e-05,
"loss": 1.6143,
"step": 3000
},
{
"epoch": 2.5,
"learning_rate": 3.1832e-05,
"loss": 1.5059,
"step": 4000
},
{
"epoch": 3.12,
"learning_rate": 3.9832e-05,
"loss": 1.4265,
"step": 5000
},
{
"epoch": 3.75,
"learning_rate": 4.7824e-05,
"loss": 1.3666,
"step": 6000
},
{
"epoch": 4.37,
"learning_rate": 5.5808000000000005e-05,
"loss": 1.3195,
"step": 7000
},
{
"epoch": 5.0,
"learning_rate": 6.38e-05,
"loss": 1.2827,
"step": 8000
},
{
"epoch": 5.62,
"learning_rate": 7.18e-05,
"loss": 1.2519,
"step": 9000
},
{
"epoch": 6.25,
"learning_rate": 7.9792e-05,
"loss": 1.2251,
"step": 10000
},
{
"epoch": 6.87,
"learning_rate": 7.93224347826087e-05,
"loss": 1.2031,
"step": 11000
},
{
"epoch": 7.5,
"learning_rate": 7.862747826086958e-05,
"loss": 1.1824,
"step": 12000
},
{
"epoch": 8.12,
"learning_rate": 7.793252173913044e-05,
"loss": 1.1646,
"step": 13000
},
{
"epoch": 8.74,
"learning_rate": 7.723756521739132e-05,
"loss": 1.1514,
"step": 14000
},
{
"epoch": 9.37,
"learning_rate": 7.654260869565218e-05,
"loss": 1.1388,
"step": 15000
},
{
"epoch": 9.99,
"learning_rate": 7.584765217391305e-05,
"loss": 1.1281,
"step": 16000
},
{
"epoch": 10.62,
"learning_rate": 7.5152e-05,
"loss": 1.1184,
"step": 17000
},
{
"epoch": 11.24,
"learning_rate": 7.445773913043479e-05,
"loss": 1.11,
"step": 18000
},
{
"epoch": 11.87,
"learning_rate": 7.376208695652175e-05,
"loss": 1.1023,
"step": 19000
},
{
"epoch": 12.49,
"learning_rate": 7.306713043478261e-05,
"loss": 1.0945,
"step": 20000
},
{
"epoch": 13.12,
"learning_rate": 7.237217391304349e-05,
"loss": 1.0871,
"step": 21000
},
{
"epoch": 13.74,
"learning_rate": 7.167791304347826e-05,
"loss": 1.081,
"step": 22000
},
{
"epoch": 14.37,
"learning_rate": 7.098226086956523e-05,
"loss": 1.075,
"step": 23000
},
{
"epoch": 14.99,
"learning_rate": 7.028730434782609e-05,
"loss": 1.0698,
"step": 24000
},
{
"epoch": 15.62,
"learning_rate": 6.959234782608696e-05,
"loss": 1.0647,
"step": 25000
},
{
"epoch": 16.24,
"learning_rate": 6.889739130434783e-05,
"loss": 1.0585,
"step": 26000
},
{
"epoch": 16.86,
"learning_rate": 6.820173913043479e-05,
"loss": 1.0556,
"step": 27000
},
{
"epoch": 17.49,
"learning_rate": 6.750678260869565e-05,
"loss": 1.0504,
"step": 28000
},
{
"epoch": 18.11,
"learning_rate": 6.681182608695653e-05,
"loss": 1.0471,
"step": 29000
},
{
"epoch": 18.74,
"learning_rate": 6.61168695652174e-05,
"loss": 1.0426,
"step": 30000
},
{
"epoch": 19.36,
"learning_rate": 6.542191304347826e-05,
"loss": 1.0392,
"step": 31000
},
{
"epoch": 19.99,
"learning_rate": 6.472626086956522e-05,
"loss": 1.0364,
"step": 32000
},
{
"epoch": 20.61,
"learning_rate": 6.4032e-05,
"loss": 1.0324,
"step": 33000
},
{
"epoch": 21.24,
"learning_rate": 6.333634782608696e-05,
"loss": 1.0302,
"step": 34000
},
{
"epoch": 21.86,
"learning_rate": 6.264139130434782e-05,
"loss": 1.027,
"step": 35000
},
{
"epoch": 22.49,
"learning_rate": 6.19464347826087e-05,
"loss": 1.0234,
"step": 36000
},
{
"epoch": 23.11,
"learning_rate": 6.125147826086957e-05,
"loss": 1.0204,
"step": 37000
},
{
"epoch": 23.74,
"learning_rate": 6.0556521739130436e-05,
"loss": 1.0178,
"step": 38000
},
{
"epoch": 24.36,
"learning_rate": 5.986156521739131e-05,
"loss": 1.0162,
"step": 39000
},
{
"epoch": 24.98,
"learning_rate": 5.9165913043478267e-05,
"loss": 1.0136,
"step": 40000
},
{
"epoch": 25.61,
"learning_rate": 5.847095652173914e-05,
"loss": 1.0112,
"step": 41000
},
{
"epoch": 26.23,
"learning_rate": 5.7776000000000004e-05,
"loss": 1.0089,
"step": 42000
},
{
"epoch": 26.86,
"learning_rate": 5.708104347826087e-05,
"loss": 1.0069,
"step": 43000
},
{
"epoch": 27.48,
"learning_rate": 5.6385391304347834e-05,
"loss": 1.005,
"step": 44000
},
{
"epoch": 28.11,
"learning_rate": 5.56904347826087e-05,
"loss": 1.0026,
"step": 45000
},
{
"epoch": 28.73,
"learning_rate": 5.499478260869565e-05,
"loss": 1.0008,
"step": 46000
},
{
"epoch": 29.36,
"learning_rate": 5.429982608695653e-05,
"loss": 0.9986,
"step": 47000
},
{
"epoch": 29.98,
"learning_rate": 5.3604869565217396e-05,
"loss": 0.9967,
"step": 48000
},
{
"epoch": 30.61,
"learning_rate": 5.2909913043478264e-05,
"loss": 0.9949,
"step": 49000
},
{
"epoch": 31.23,
"learning_rate": 5.2215652173913046e-05,
"loss": 0.9928,
"step": 50000
},
{
"epoch": 31.86,
"learning_rate": 5.1520695652173915e-05,
"loss": 0.992,
"step": 51000
},
{
"epoch": 32.48,
"learning_rate": 5.082504347826088e-05,
"loss": 0.9902,
"step": 52000
},
{
"epoch": 33.1,
"learning_rate": 5.0129391304347825e-05,
"loss": 0.9877,
"step": 53000
},
{
"epoch": 33.73,
"learning_rate": 4.9435130434782614e-05,
"loss": 0.9865,
"step": 54000
},
{
"epoch": 34.35,
"learning_rate": 4.873947826086957e-05,
"loss": 0.9859,
"step": 55000
},
{
"epoch": 34.98,
"learning_rate": 4.804452173913044e-05,
"loss": 0.9839,
"step": 56000
},
{
"epoch": 35.6,
"learning_rate": 4.7349565217391306e-05,
"loss": 0.9825,
"step": 57000
},
{
"epoch": 36.23,
"learning_rate": 4.665460869565218e-05,
"loss": 0.9812,
"step": 58000
},
{
"epoch": 36.85,
"learning_rate": 4.595965217391305e-05,
"loss": 0.9797,
"step": 59000
},
{
"epoch": 37.48,
"learning_rate": 4.526469565217392e-05,
"loss": 0.9784,
"step": 60000
},
{
"epoch": 38.1,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9733,
"step": 61000
},
{
"epoch": 38.73,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9707,
"step": 62000
},
{
"epoch": 39.35,
"learning_rate": 2.4e-05,
"loss": 0.9714,
"step": 63000
},
{
"epoch": 39.98,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9716,
"step": 64000
},
{
"epoch": 40.6,
"learning_rate": 4e-05,
"loss": 0.9716,
"step": 65000
},
{
"epoch": 41.22,
"learning_rate": 4.8e-05,
"loss": 0.9734,
"step": 66000
},
{
"epoch": 41.85,
"learning_rate": 5.6e-05,
"loss": 0.9748,
"step": 67000
},
{
"epoch": 42.47,
"learning_rate": 6.400000000000001e-05,
"loss": 0.9757,
"step": 68000
},
{
"epoch": 43.1,
"learning_rate": 7.2e-05,
"loss": 0.9772,
"step": 69000
},
{
"epoch": 43.72,
"learning_rate": 8e-05,
"loss": 0.977,
"step": 70000
},
{
"epoch": 44.35,
"learning_rate": 7.930434782608697e-05,
"loss": 0.9782,
"step": 71000
},
{
"epoch": 44.97,
"learning_rate": 7.860869565217392e-05,
"loss": 0.977,
"step": 72000
},
{
"epoch": 45.6,
"learning_rate": 7.791304347826088e-05,
"loss": 0.9752,
"step": 73000
},
{
"epoch": 46.22,
"learning_rate": 7.721739130434783e-05,
"loss": 0.9747,
"step": 74000
},
{
"epoch": 46.85,
"learning_rate": 7.652173913043479e-05,
"loss": 0.9728,
"step": 75000
},
{
"epoch": 47.47,
"learning_rate": 7.582608695652174e-05,
"loss": 0.9701,
"step": 76000
},
{
"epoch": 48.09,
"learning_rate": 7.51304347826087e-05,
"loss": 0.9699,
"step": 77000
},
{
"epoch": 48.72,
"learning_rate": 7.443478260869565e-05,
"loss": 0.9681,
"step": 78000
},
{
"epoch": 49.34,
"learning_rate": 7.373913043478261e-05,
"loss": 0.9669,
"step": 79000
},
{
"epoch": 49.97,
"learning_rate": 7.304347826086957e-05,
"loss": 0.966,
"step": 80000
},
{
"epoch": 50.59,
"learning_rate": 7.864000000000001e-06,
"loss": 0.9577,
"step": 81000
},
{
"epoch": 51.22,
"learning_rate": 1.5856e-05,
"loss": 0.9535,
"step": 82000
},
{
"epoch": 51.84,
"learning_rate": 2.3848e-05,
"loss": 0.9518,
"step": 83000
},
{
"epoch": 52.47,
"learning_rate": 3.184000000000001e-05,
"loss": 0.9525,
"step": 84000
},
{
"epoch": 53.09,
"learning_rate": 3.9832e-05,
"loss": 0.9531,
"step": 85000
},
{
"epoch": 53.72,
"learning_rate": 4.7824e-05,
"loss": 0.9537,
"step": 86000
},
{
"epoch": 54.34,
"learning_rate": 5.5816e-05,
"loss": 0.9541,
"step": 87000
},
{
"epoch": 54.97,
"learning_rate": 6.380800000000001e-05,
"loss": 0.9561,
"step": 88000
},
{
"epoch": 55.59,
"learning_rate": 7.1792e-05,
"loss": 0.9562,
"step": 89000
},
{
"epoch": 56.21,
"learning_rate": 7.9792e-05,
"loss": 0.959,
"step": 90000
},
{
"epoch": 56.84,
"learning_rate": 7.932313043478262e-05,
"loss": 0.9589,
"step": 91000
},
{
"epoch": 57.46,
"learning_rate": 7.862817391304348e-05,
"loss": 0.9586,
"step": 92000
},
{
"epoch": 58.09,
"learning_rate": 7.793252173913044e-05,
"loss": 0.958,
"step": 93000
},
{
"epoch": 58.71,
"learning_rate": 7.723756521739132e-05,
"loss": 0.9558,
"step": 94000
},
{
"epoch": 59.34,
"learning_rate": 7.654260869565218e-05,
"loss": 0.9549,
"step": 95000
},
{
"epoch": 59.96,
"learning_rate": 7.584695652173914e-05,
"loss": 0.954,
"step": 96000
},
{
"epoch": 60.59,
"learning_rate": 7.5152e-05,
"loss": 0.9522,
"step": 97000
},
{
"epoch": 61.21,
"learning_rate": 7.445704347826088e-05,
"loss": 0.9517,
"step": 98000
},
{
"epoch": 61.84,
"learning_rate": 7.376208695652175e-05,
"loss": 0.9509,
"step": 99000
},
{
"epoch": 62.46,
"learning_rate": 7.306852173913044e-05,
"loss": 0.9497,
"step": 100000
},
{
"epoch": 63.09,
"learning_rate": 7.23728695652174e-05,
"loss": 0.9483,
"step": 101000
},
{
"epoch": 63.71,
"learning_rate": 7.167721739130435e-05,
"loss": 0.947,
"step": 102000
},
{
"epoch": 64.33,
"learning_rate": 7.098226086956523e-05,
"loss": 0.9463,
"step": 103000
},
{
"epoch": 64.96,
"learning_rate": 7.028730434782609e-05,
"loss": 0.9455,
"step": 104000
},
{
"epoch": 65.58,
"learning_rate": 6.959165217391305e-05,
"loss": 0.9437,
"step": 105000
},
{
"epoch": 66.21,
"learning_rate": 6.889669565217393e-05,
"loss": 0.9437,
"step": 106000
},
{
"epoch": 66.83,
"learning_rate": 6.820173913043479e-05,
"loss": 0.9429,
"step": 107000
},
{
"epoch": 67.46,
"learning_rate": 6.750608695652175e-05,
"loss": 0.9418,
"step": 108000
},
{
"epoch": 68.08,
"learning_rate": 6.681113043478261e-05,
"loss": 0.941,
"step": 109000
},
{
"epoch": 68.71,
"learning_rate": 6.611617391304349e-05,
"loss": 0.9397,
"step": 110000
},
{
"epoch": 69.33,
"learning_rate": 6.542121739130435e-05,
"loss": 0.939,
"step": 111000
},
{
"epoch": 69.96,
"learning_rate": 6.472556521739131e-05,
"loss": 0.9382,
"step": 112000
},
{
"epoch": 70.58,
"learning_rate": 6.403060869565217e-05,
"loss": 0.9375,
"step": 113000
},
{
"epoch": 71.21,
"learning_rate": 6.333634782608696e-05,
"loss": 0.9362,
"step": 114000
},
{
"epoch": 71.83,
"learning_rate": 6.264069565217392e-05,
"loss": 0.9362,
"step": 115000
},
{
"epoch": 72.45,
"learning_rate": 6.194573913043479e-05,
"loss": 0.9348,
"step": 116000
},
{
"epoch": 73.08,
"learning_rate": 6.125078260869566e-05,
"loss": 0.934,
"step": 117000
},
{
"epoch": 73.7,
"learning_rate": 6.055513043478261e-05,
"loss": 0.9325,
"step": 118000
},
{
"epoch": 74.33,
"learning_rate": 5.986017391304348e-05,
"loss": 0.9321,
"step": 119000
},
{
"epoch": 74.95,
"learning_rate": 5.916521739130435e-05,
"loss": 0.9315,
"step": 120000
},
{
"epoch": 75.58,
"learning_rate": 5.847026086956522e-05,
"loss": 0.9299,
"step": 121000
},
{
"epoch": 76.2,
"learning_rate": 5.777530434782609e-05,
"loss": 0.9291,
"step": 122000
},
{
"epoch": 76.83,
"learning_rate": 5.708034782608696e-05,
"loss": 0.9291,
"step": 123000
},
{
"epoch": 77.45,
"learning_rate": 5.6385391304347834e-05,
"loss": 0.9277,
"step": 124000
},
{
"epoch": 78.08,
"learning_rate": 5.56904347826087e-05,
"loss": 0.9278,
"step": 125000
},
{
"epoch": 78.08,
"step": 125000,
"total_flos": 6.73699192934362e+19,
"train_loss": 0.18748452099609375,
"train_runtime": 7317.8985,
"train_samples_per_second": 34982.721,
"train_steps_per_second": 17.081
}
],
"max_steps": 125000,
"num_train_epochs": 79,
"total_flos": 6.73699192934362e+19,
"trial_name": null,
"trial_params": null
}
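
This file follows the `trainer_state.json` layout written by the Hugging Face `transformers` `Trainer`: `log_history` holds one entry per logging step, and its final entry carries run-level summary statistics (`train_runtime`, `total_flos`, etc.) rather than a loss/learning-rate pair. A minimal sketch of reading the log back, assuming the file has been downloaded locally as `trainer_state.json` and `matplotlib` is installed:

```python
# Minimal sketch: parse trainer_state.json and plot the pre-training loss curve.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the last item in log_history holds
# summary stats (train_runtime, total_flos, ...) instead of loss/learning_rate.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("MechBERT-uncased pre-training loss")
plt.show()
```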