{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.009493858660179197,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.373464665044799e-05,
"eval_loss": 1.1060571670532227,
"eval_runtime": 512.4208,
"eval_samples_per_second": 34.62,
"eval_steps_per_second": 17.31,
"step": 1
},
{
"epoch": 0.00011867323325223996,
"grad_norm": 0.06994107365608215,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7434,
"step": 5
},
{
"epoch": 0.00023734646650447992,
"grad_norm": 0.08292974531650543,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.86,
"step": 10
},
{
"epoch": 0.00035601969975671986,
"grad_norm": 0.11479827016592026,
"learning_rate": 5e-05,
"loss": 0.9153,
"step": 15
},
{
"epoch": 0.00047469293300895983,
"grad_norm": 0.13735784590244293,
"learning_rate": 6.666666666666667e-05,
"loss": 0.945,
"step": 20
},
{
"epoch": 0.0005933661662611998,
"grad_norm": 0.15192271769046783,
"learning_rate": 8.333333333333334e-05,
"loss": 1.0809,
"step": 25
},
{
"epoch": 0.0007120393995134397,
"grad_norm": 0.148358553647995,
"learning_rate": 0.0001,
"loss": 1.0627,
"step": 30
},
{
"epoch": 0.0008307126327656797,
"grad_norm": 0.230050191283226,
"learning_rate": 9.995494831023409e-05,
"loss": 1.1128,
"step": 35
},
{
"epoch": 0.0009493858660179197,
"grad_norm": 0.2764645218849182,
"learning_rate": 9.981987442712633e-05,
"loss": 1.0843,
"step": 40
},
{
"epoch": 0.0010680590992701597,
"grad_norm": 0.2294786274433136,
"learning_rate": 9.959502176294383e-05,
"loss": 1.0233,
"step": 45
},
{
"epoch": 0.0011867323325223996,
"grad_norm": 0.3697080612182617,
"learning_rate": 9.928079551738543e-05,
"loss": 0.9998,
"step": 50
},
{
"epoch": 0.0013054055657746395,
"grad_norm": 0.12134025990962982,
"learning_rate": 9.887776194738432e-05,
"loss": 0.813,
"step": 55
},
{
"epoch": 0.0014240787990268794,
"grad_norm": 0.1504671722650528,
"learning_rate": 9.838664734667495e-05,
"loss": 0.8707,
"step": 60
},
{
"epoch": 0.0015427520322791194,
"grad_norm": 0.12335077673196793,
"learning_rate": 9.780833673696254e-05,
"loss": 0.8697,
"step": 65
},
{
"epoch": 0.0016614252655313595,
"grad_norm": 0.1612686961889267,
"learning_rate": 9.714387227305422e-05,
"loss": 0.8871,
"step": 70
},
{
"epoch": 0.0017800984987835994,
"grad_norm": 0.15226492285728455,
"learning_rate": 9.639445136482548e-05,
"loss": 1.024,
"step": 75
},
{
"epoch": 0.0018987717320358393,
"grad_norm": 0.1592480093240738,
"learning_rate": 9.55614245194068e-05,
"loss": 1.006,
"step": 80
},
{
"epoch": 0.0020174449652880793,
"grad_norm": 0.16184721887111664,
"learning_rate": 9.464629290747842e-05,
"loss": 0.9669,
"step": 85
},
{
"epoch": 0.0021361181985403194,
"grad_norm": 0.20670561492443085,
"learning_rate": 9.365070565805941e-05,
"loss": 0.941,
"step": 90
},
{
"epoch": 0.002254791431792559,
"grad_norm": 0.2282506376504898,
"learning_rate": 9.257645688666556e-05,
"loss": 0.9345,
"step": 95
},
{
"epoch": 0.0023734646650447992,
"grad_norm": 0.40875211358070374,
"learning_rate": 9.142548246219212e-05,
"loss": 0.9923,
"step": 100
},
{
"epoch": 0.0023734646650447992,
"eval_loss": 0.9161506295204163,
"eval_runtime": 511.6193,
"eval_samples_per_second": 34.674,
"eval_steps_per_second": 17.337,
"step": 100
},
{
"epoch": 0.002492137898297039,
"grad_norm": 0.09706299751996994,
"learning_rate": 9.019985651834703e-05,
"loss": 0.7927,
"step": 105
},
{
"epoch": 0.002610811131549279,
"grad_norm": 0.12703746557235718,
"learning_rate": 8.890178771592199e-05,
"loss": 0.9126,
"step": 110
},
{
"epoch": 0.002729484364801519,
"grad_norm": 0.152408629655838,
"learning_rate": 8.753361526263621e-05,
"loss": 0.8603,
"step": 115
},
{
"epoch": 0.002848157598053759,
"grad_norm": 0.15565912425518036,
"learning_rate": 8.609780469772623e-05,
"loss": 0.863,
"step": 120
},
{
"epoch": 0.002966830831305999,
"grad_norm": 0.1481216698884964,
"learning_rate": 8.459694344887732e-05,
"loss": 0.8306,
"step": 125
},
{
"epoch": 0.0030855040645582387,
"grad_norm": 0.16268706321716309,
"learning_rate": 8.303373616950408e-05,
"loss": 0.9171,
"step": 130
},
{
"epoch": 0.003204177297810479,
"grad_norm": 0.19657278060913086,
"learning_rate": 8.141099986478212e-05,
"loss": 1.0378,
"step": 135
},
{
"epoch": 0.003322850531062719,
"grad_norm": 0.21237877011299133,
"learning_rate": 7.973165881521434e-05,
"loss": 0.9739,
"step": 140
},
{
"epoch": 0.0034415237643149587,
"grad_norm": 0.22750777006149292,
"learning_rate": 7.799873930687978e-05,
"loss": 0.89,
"step": 145
},
{
"epoch": 0.003560196997567199,
"grad_norm": 0.4223545789718628,
"learning_rate": 7.621536417786159e-05,
"loss": 0.8019,
"step": 150
},
{
"epoch": 0.0036788702308194385,
"grad_norm": 0.10099593549966812,
"learning_rate": 7.438474719068173e-05,
"loss": 0.7283,
"step": 155
},
{
"epoch": 0.0037975434640716787,
"grad_norm": 0.15844862163066864,
"learning_rate": 7.251018724088367e-05,
"loss": 0.8589,
"step": 160
},
{
"epoch": 0.003916216697323918,
"grad_norm": 0.13598279654979706,
"learning_rate": 7.059506241219965e-05,
"loss": 0.8389,
"step": 165
},
{
"epoch": 0.0040348899305761585,
"grad_norm": 0.1379881352186203,
"learning_rate": 6.864282388901544e-05,
"loss": 0.8838,
"step": 170
},
{
"epoch": 0.004153563163828399,
"grad_norm": 0.1695917397737503,
"learning_rate": 6.665698973710288e-05,
"loss": 0.8816,
"step": 175
},
{
"epoch": 0.004272236397080639,
"grad_norm": 0.1812783181667328,
"learning_rate": 6.464113856382752e-05,
"loss": 0.941,
"step": 180
},
{
"epoch": 0.004390909630332878,
"grad_norm": 0.19394883513450623,
"learning_rate": 6.259890306925627e-05,
"loss": 0.9778,
"step": 185
},
{
"epoch": 0.004509582863585118,
"grad_norm": 0.23465299606323242,
"learning_rate": 6.0533963499786314e-05,
"loss": 0.9645,
"step": 190
},
{
"epoch": 0.004628256096837358,
"grad_norm": 0.25456446409225464,
"learning_rate": 5.8450041016092464e-05,
"loss": 0.9213,
"step": 195
},
{
"epoch": 0.0047469293300895984,
"grad_norm": 0.4076145887374878,
"learning_rate": 5.6350890987343944e-05,
"loss": 0.884,
"step": 200
},
{
"epoch": 0.0047469293300895984,
"eval_loss": 0.897860586643219,
"eval_runtime": 510.3146,
"eval_samples_per_second": 34.763,
"eval_steps_per_second": 17.381,
"step": 200
},
{
"epoch": 0.004865602563341839,
"grad_norm": 0.1094803586602211,
"learning_rate": 5.4240296223775465e-05,
"loss": 0.7631,
"step": 205
},
{
"epoch": 0.004984275796594078,
"grad_norm": 0.14633063971996307,
"learning_rate": 5.212206015980742e-05,
"loss": 0.8653,
"step": 210
},
{
"epoch": 0.005102949029846318,
"grad_norm": 0.12594805657863617,
"learning_rate": 5e-05,
"loss": 0.8278,
"step": 215
},
{
"epoch": 0.005221622263098558,
"grad_norm": 0.1753065288066864,
"learning_rate": 4.78779398401926e-05,
"loss": 0.8959,
"step": 220
},
{
"epoch": 0.005340295496350798,
"grad_norm": 0.1741386353969574,
"learning_rate": 4.575970377622456e-05,
"loss": 0.9233,
"step": 225
},
{
"epoch": 0.005458968729603038,
"grad_norm": 0.19290021061897278,
"learning_rate": 4.364910901265606e-05,
"loss": 0.9547,
"step": 230
},
{
"epoch": 0.005577641962855278,
"grad_norm": 0.22662971913814545,
"learning_rate": 4.1549958983907555e-05,
"loss": 0.9353,
"step": 235
},
{
"epoch": 0.005696315196107518,
"grad_norm": 0.2653945982456207,
"learning_rate": 3.94660365002137e-05,
"loss": 0.9487,
"step": 240
},
{
"epoch": 0.005814988429359758,
"grad_norm": 0.2608093321323395,
"learning_rate": 3.740109693074375e-05,
"loss": 0.9119,
"step": 245
},
{
"epoch": 0.005933661662611998,
"grad_norm": 0.4665874242782593,
"learning_rate": 3.5358861436172485e-05,
"loss": 0.904,
"step": 250
},
{
"epoch": 0.006052334895864238,
"grad_norm": 0.10215272009372711,
"learning_rate": 3.334301026289712e-05,
"loss": 0.7477,
"step": 255
},
{
"epoch": 0.0061710081291164775,
"grad_norm": 0.1244097575545311,
"learning_rate": 3.135717611098458e-05,
"loss": 0.7905,
"step": 260
},
{
"epoch": 0.006289681362368718,
"grad_norm": 0.16128472983837128,
"learning_rate": 2.9404937587800375e-05,
"loss": 0.8343,
"step": 265
},
{
"epoch": 0.006408354595620958,
"grad_norm": 0.15076673030853271,
"learning_rate": 2.748981275911633e-05,
"loss": 0.7976,
"step": 270
},
{
"epoch": 0.006527027828873198,
"grad_norm": 0.17758683860301971,
"learning_rate": 2.5615252809318284e-05,
"loss": 0.9116,
"step": 275
},
{
"epoch": 0.006645701062125438,
"grad_norm": 0.22221983969211578,
"learning_rate": 2.3784635822138424e-05,
"loss": 0.9473,
"step": 280
},
{
"epoch": 0.006764374295377677,
"grad_norm": 0.2009792923927307,
"learning_rate": 2.2001260693120233e-05,
"loss": 0.9147,
"step": 285
},
{
"epoch": 0.006883047528629917,
"grad_norm": 0.24482344090938568,
"learning_rate": 2.026834118478567e-05,
"loss": 1.0044,
"step": 290
},
{
"epoch": 0.0070017207618821575,
"grad_norm": 0.28756579756736755,
"learning_rate": 1.858900013521788e-05,
"loss": 1.0234,
"step": 295
},
{
"epoch": 0.007120393995134398,
"grad_norm": 0.4237484931945801,
"learning_rate": 1.6966263830495936e-05,
"loss": 0.8254,
"step": 300
},
{
"epoch": 0.007120393995134398,
"eval_loss": 0.890705406665802,
"eval_runtime": 510.6082,
"eval_samples_per_second": 34.743,
"eval_steps_per_second": 17.371,
"step": 300
},
{
"epoch": 0.007239067228386638,
"grad_norm": 0.09181042015552521,
"learning_rate": 1.5403056551122697e-05,
"loss": 0.7054,
"step": 305
},
{
"epoch": 0.007357740461638877,
"grad_norm": 0.1410045176744461,
"learning_rate": 1.3902195302273779e-05,
"loss": 0.8037,
"step": 310
},
{
"epoch": 0.007476413694891117,
"grad_norm": 0.1454688459634781,
"learning_rate": 1.246638473736378e-05,
"loss": 0.8922,
"step": 315
},
{
"epoch": 0.007595086928143357,
"grad_norm": 0.17593573033809662,
"learning_rate": 1.1098212284078036e-05,
"loss": 0.9334,
"step": 320
},
{
"epoch": 0.0077137601613955975,
"grad_norm": 0.18990731239318848,
"learning_rate": 9.800143481652979e-06,
"loss": 0.9446,
"step": 325
},
{
"epoch": 0.007832433394647837,
"grad_norm": 0.1836443394422531,
"learning_rate": 8.574517537807897e-06,
"loss": 0.9762,
"step": 330
},
{
"epoch": 0.007951106627900077,
"grad_norm": 0.23809511959552765,
"learning_rate": 7.423543113334436e-06,
"loss": 0.9713,
"step": 335
},
{
"epoch": 0.008069779861152317,
"grad_norm": 0.23665811121463776,
"learning_rate": 6.349294341940593e-06,
"loss": 0.9209,
"step": 340
},
{
"epoch": 0.008188453094404557,
"grad_norm": 0.2725285291671753,
"learning_rate": 5.353707092521582e-06,
"loss": 0.9288,
"step": 345
},
{
"epoch": 0.008307126327656797,
"grad_norm": 0.4677680432796478,
"learning_rate": 4.43857548059321e-06,
"loss": 0.8756,
"step": 350
},
{
"epoch": 0.008425799560909037,
"grad_norm": 0.10243045538663864,
"learning_rate": 3.605548635174533e-06,
"loss": 0.7671,
"step": 355
},
{
"epoch": 0.008544472794161278,
"grad_norm": 0.1385096162557602,
"learning_rate": 2.85612772694579e-06,
"loss": 0.7777,
"step": 360
},
{
"epoch": 0.008663146027413518,
"grad_norm": 0.13208024203777313,
"learning_rate": 2.191663263037458e-06,
"loss": 0.7676,
"step": 365
},
{
"epoch": 0.008781819260665756,
"grad_norm": 0.16409678757190704,
"learning_rate": 1.6133526533250565e-06,
"loss": 0.8646,
"step": 370
},
{
"epoch": 0.008900492493917996,
"grad_norm": 0.1728227734565735,
"learning_rate": 1.1222380526156928e-06,
"loss": 0.911,
"step": 375
},
{
"epoch": 0.009019165727170236,
"grad_norm": 0.19500896334648132,
"learning_rate": 7.192044826145771e-07,
"loss": 0.9795,
"step": 380
},
{
"epoch": 0.009137838960422476,
"grad_norm": 0.21884216368198395,
"learning_rate": 4.049782370561583e-07,
"loss": 0.8884,
"step": 385
},
{
"epoch": 0.009256512193674717,
"grad_norm": 0.2533111572265625,
"learning_rate": 1.8012557287367392e-07,
"loss": 0.979,
"step": 390
},
{
"epoch": 0.009375185426926957,
"grad_norm": 0.27901849150657654,
"learning_rate": 4.5051689765929214e-08,
"loss": 0.9472,
"step": 395
},
{
"epoch": 0.009493858660179197,
"grad_norm": 0.41356849670410156,
"learning_rate": 0.0,
"loss": 0.7907,
"step": 400
},
{
"epoch": 0.009493858660179197,
"eval_loss": 0.8899700045585632,
"eval_runtime": 511.1186,
"eval_samples_per_second": 34.708,
"eval_steps_per_second": 17.354,
"step": 400
}
],
"logging_steps": 5,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6483000014929920.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}