|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.009493858660179197, |
|
"eval_steps": 100, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.373464665044799e-05, |
|
"eval_loss": 1.1060571670532227, |
|
"eval_runtime": 512.4208, |
|
"eval_samples_per_second": 34.62, |
|
"eval_steps_per_second": 17.31, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00011867323325223996, |
|
"grad_norm": 0.06994107365608215, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.7434, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00023734646650447992, |
|
"grad_norm": 0.08292974531650543, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.86, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00035601969975671986, |
|
"grad_norm": 0.11479827016592026, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9153, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00047469293300895983, |
|
"grad_norm": 0.13735784590244293, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.945, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0005933661662611998, |
|
"grad_norm": 0.15192271769046783, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.0809, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0007120393995134397, |
|
"grad_norm": 0.148358553647995, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0627, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0008307126327656797, |
|
"grad_norm": 0.230050191283226, |
|
"learning_rate": 9.995494831023409e-05, |
|
"loss": 1.1128, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0009493858660179197, |
|
"grad_norm": 0.2764645218849182, |
|
"learning_rate": 9.981987442712633e-05, |
|
"loss": 1.0843, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0010680590992701597, |
|
"grad_norm": 0.2294786274433136, |
|
"learning_rate": 9.959502176294383e-05, |
|
"loss": 1.0233, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0011867323325223996, |
|
"grad_norm": 0.3697080612182617, |
|
"learning_rate": 9.928079551738543e-05, |
|
"loss": 0.9998, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0013054055657746395, |
|
"grad_norm": 0.12134025990962982, |
|
"learning_rate": 9.887776194738432e-05, |
|
"loss": 0.813, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0014240787990268794, |
|
"grad_norm": 0.1504671722650528, |
|
"learning_rate": 9.838664734667495e-05, |
|
"loss": 0.8707, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0015427520322791194, |
|
"grad_norm": 0.12335077673196793, |
|
"learning_rate": 9.780833673696254e-05, |
|
"loss": 0.8697, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0016614252655313595, |
|
"grad_norm": 0.1612686961889267, |
|
"learning_rate": 9.714387227305422e-05, |
|
"loss": 0.8871, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0017800984987835994, |
|
"grad_norm": 0.15226492285728455, |
|
"learning_rate": 9.639445136482548e-05, |
|
"loss": 1.024, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0018987717320358393, |
|
"grad_norm": 0.1592480093240738, |
|
"learning_rate": 9.55614245194068e-05, |
|
"loss": 1.006, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0020174449652880793, |
|
"grad_norm": 0.16184721887111664, |
|
"learning_rate": 9.464629290747842e-05, |
|
"loss": 0.9669, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0021361181985403194, |
|
"grad_norm": 0.20670561492443085, |
|
"learning_rate": 9.365070565805941e-05, |
|
"loss": 0.941, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.002254791431792559, |
|
"grad_norm": 0.2282506376504898, |
|
"learning_rate": 9.257645688666556e-05, |
|
"loss": 0.9345, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0023734646650447992, |
|
"grad_norm": 0.40875211358070374, |
|
"learning_rate": 9.142548246219212e-05, |
|
"loss": 0.9923, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0023734646650447992, |
|
"eval_loss": 0.9161506295204163, |
|
"eval_runtime": 511.6193, |
|
"eval_samples_per_second": 34.674, |
|
"eval_steps_per_second": 17.337, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.002492137898297039, |
|
"grad_norm": 0.09706299751996994, |
|
"learning_rate": 9.019985651834703e-05, |
|
"loss": 0.7927, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.002610811131549279, |
|
"grad_norm": 0.12703746557235718, |
|
"learning_rate": 8.890178771592199e-05, |
|
"loss": 0.9126, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.002729484364801519, |
|
"grad_norm": 0.152408629655838, |
|
"learning_rate": 8.753361526263621e-05, |
|
"loss": 0.8603, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.002848157598053759, |
|
"grad_norm": 0.15565912425518036, |
|
"learning_rate": 8.609780469772623e-05, |
|
"loss": 0.863, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.002966830831305999, |
|
"grad_norm": 0.1481216698884964, |
|
"learning_rate": 8.459694344887732e-05, |
|
"loss": 0.8306, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0030855040645582387, |
|
"grad_norm": 0.16268706321716309, |
|
"learning_rate": 8.303373616950408e-05, |
|
"loss": 0.9171, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003204177297810479, |
|
"grad_norm": 0.19657278060913086, |
|
"learning_rate": 8.141099986478212e-05, |
|
"loss": 1.0378, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.003322850531062719, |
|
"grad_norm": 0.21237877011299133, |
|
"learning_rate": 7.973165881521434e-05, |
|
"loss": 0.9739, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0034415237643149587, |
|
"grad_norm": 0.22750777006149292, |
|
"learning_rate": 7.799873930687978e-05, |
|
"loss": 0.89, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.003560196997567199, |
|
"grad_norm": 0.4223545789718628, |
|
"learning_rate": 7.621536417786159e-05, |
|
"loss": 0.8019, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0036788702308194385, |
|
"grad_norm": 0.10099593549966812, |
|
"learning_rate": 7.438474719068173e-05, |
|
"loss": 0.7283, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0037975434640716787, |
|
"grad_norm": 0.15844862163066864, |
|
"learning_rate": 7.251018724088367e-05, |
|
"loss": 0.8589, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.003916216697323918, |
|
"grad_norm": 0.13598279654979706, |
|
"learning_rate": 7.059506241219965e-05, |
|
"loss": 0.8389, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0040348899305761585, |
|
"grad_norm": 0.1379881352186203, |
|
"learning_rate": 6.864282388901544e-05, |
|
"loss": 0.8838, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004153563163828399, |
|
"grad_norm": 0.1695917397737503, |
|
"learning_rate": 6.665698973710288e-05, |
|
"loss": 0.8816, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.004272236397080639, |
|
"grad_norm": 0.1812783181667328, |
|
"learning_rate": 6.464113856382752e-05, |
|
"loss": 0.941, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.004390909630332878, |
|
"grad_norm": 0.19394883513450623, |
|
"learning_rate": 6.259890306925627e-05, |
|
"loss": 0.9778, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.004509582863585118, |
|
"grad_norm": 0.23465299606323242, |
|
"learning_rate": 6.0533963499786314e-05, |
|
"loss": 0.9645, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.004628256096837358, |
|
"grad_norm": 0.25456446409225464, |
|
"learning_rate": 5.8450041016092464e-05, |
|
"loss": 0.9213, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0047469293300895984, |
|
"grad_norm": 0.4076145887374878, |
|
"learning_rate": 5.6350890987343944e-05, |
|
"loss": 0.884, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0047469293300895984, |
|
"eval_loss": 0.897860586643219, |
|
"eval_runtime": 510.3146, |
|
"eval_samples_per_second": 34.763, |
|
"eval_steps_per_second": 17.381, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.004865602563341839, |
|
"grad_norm": 0.1094803586602211, |
|
"learning_rate": 5.4240296223775465e-05, |
|
"loss": 0.7631, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.004984275796594078, |
|
"grad_norm": 0.14633063971996307, |
|
"learning_rate": 5.212206015980742e-05, |
|
"loss": 0.8653, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005102949029846318, |
|
"grad_norm": 0.12594805657863617, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8278, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.005221622263098558, |
|
"grad_norm": 0.1753065288066864, |
|
"learning_rate": 4.78779398401926e-05, |
|
"loss": 0.8959, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.005340295496350798, |
|
"grad_norm": 0.1741386353969574, |
|
"learning_rate": 4.575970377622456e-05, |
|
"loss": 0.9233, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.005458968729603038, |
|
"grad_norm": 0.19290021061897278, |
|
"learning_rate": 4.364910901265606e-05, |
|
"loss": 0.9547, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.005577641962855278, |
|
"grad_norm": 0.22662971913814545, |
|
"learning_rate": 4.1549958983907555e-05, |
|
"loss": 0.9353, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.005696315196107518, |
|
"grad_norm": 0.2653945982456207, |
|
"learning_rate": 3.94660365002137e-05, |
|
"loss": 0.9487, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.005814988429359758, |
|
"grad_norm": 0.2608093321323395, |
|
"learning_rate": 3.740109693074375e-05, |
|
"loss": 0.9119, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.005933661662611998, |
|
"grad_norm": 0.4665874242782593, |
|
"learning_rate": 3.5358861436172485e-05, |
|
"loss": 0.904, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.006052334895864238, |
|
"grad_norm": 0.10215272009372711, |
|
"learning_rate": 3.334301026289712e-05, |
|
"loss": 0.7477, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.0061710081291164775, |
|
"grad_norm": 0.1244097575545311, |
|
"learning_rate": 3.135717611098458e-05, |
|
"loss": 0.7905, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.006289681362368718, |
|
"grad_norm": 0.16128472983837128, |
|
"learning_rate": 2.9404937587800375e-05, |
|
"loss": 0.8343, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.006408354595620958, |
|
"grad_norm": 0.15076673030853271, |
|
"learning_rate": 2.748981275911633e-05, |
|
"loss": 0.7976, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.006527027828873198, |
|
"grad_norm": 0.17758683860301971, |
|
"learning_rate": 2.5615252809318284e-05, |
|
"loss": 0.9116, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.006645701062125438, |
|
"grad_norm": 0.22221983969211578, |
|
"learning_rate": 2.3784635822138424e-05, |
|
"loss": 0.9473, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.006764374295377677, |
|
"grad_norm": 0.2009792923927307, |
|
"learning_rate": 2.2001260693120233e-05, |
|
"loss": 0.9147, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.006883047528629917, |
|
"grad_norm": 0.24482344090938568, |
|
"learning_rate": 2.026834118478567e-05, |
|
"loss": 1.0044, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0070017207618821575, |
|
"grad_norm": 0.28756579756736755, |
|
"learning_rate": 1.858900013521788e-05, |
|
"loss": 1.0234, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.007120393995134398, |
|
"grad_norm": 0.4237484931945801, |
|
"learning_rate": 1.6966263830495936e-05, |
|
"loss": 0.8254, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.007120393995134398, |
|
"eval_loss": 0.890705406665802, |
|
"eval_runtime": 510.6082, |
|
"eval_samples_per_second": 34.743, |
|
"eval_steps_per_second": 17.371, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.007239067228386638, |
|
"grad_norm": 0.09181042015552521, |
|
"learning_rate": 1.5403056551122697e-05, |
|
"loss": 0.7054, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.007357740461638877, |
|
"grad_norm": 0.1410045176744461, |
|
"learning_rate": 1.3902195302273779e-05, |
|
"loss": 0.8037, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.007476413694891117, |
|
"grad_norm": 0.1454688459634781, |
|
"learning_rate": 1.246638473736378e-05, |
|
"loss": 0.8922, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.007595086928143357, |
|
"grad_norm": 0.17593573033809662, |
|
"learning_rate": 1.1098212284078036e-05, |
|
"loss": 0.9334, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0077137601613955975, |
|
"grad_norm": 0.18990731239318848, |
|
"learning_rate": 9.800143481652979e-06, |
|
"loss": 0.9446, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.007832433394647837, |
|
"grad_norm": 0.1836443394422531, |
|
"learning_rate": 8.574517537807897e-06, |
|
"loss": 0.9762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.007951106627900077, |
|
"grad_norm": 0.23809511959552765, |
|
"learning_rate": 7.423543113334436e-06, |
|
"loss": 0.9713, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.008069779861152317, |
|
"grad_norm": 0.23665811121463776, |
|
"learning_rate": 6.349294341940593e-06, |
|
"loss": 0.9209, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.008188453094404557, |
|
"grad_norm": 0.2725285291671753, |
|
"learning_rate": 5.353707092521582e-06, |
|
"loss": 0.9288, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.008307126327656797, |
|
"grad_norm": 0.4677680432796478, |
|
"learning_rate": 4.43857548059321e-06, |
|
"loss": 0.8756, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.008425799560909037, |
|
"grad_norm": 0.10243045538663864, |
|
"learning_rate": 3.605548635174533e-06, |
|
"loss": 0.7671, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.008544472794161278, |
|
"grad_norm": 0.1385096162557602, |
|
"learning_rate": 2.85612772694579e-06, |
|
"loss": 0.7777, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.008663146027413518, |
|
"grad_norm": 0.13208024203777313, |
|
"learning_rate": 2.191663263037458e-06, |
|
"loss": 0.7676, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.008781819260665756, |
|
"grad_norm": 0.16409678757190704, |
|
"learning_rate": 1.6133526533250565e-06, |
|
"loss": 0.8646, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.008900492493917996, |
|
"grad_norm": 0.1728227734565735, |
|
"learning_rate": 1.1222380526156928e-06, |
|
"loss": 0.911, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.009019165727170236, |
|
"grad_norm": 0.19500896334648132, |
|
"learning_rate": 7.192044826145771e-07, |
|
"loss": 0.9795, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.009137838960422476, |
|
"grad_norm": 0.21884216368198395, |
|
"learning_rate": 4.049782370561583e-07, |
|
"loss": 0.8884, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.009256512193674717, |
|
"grad_norm": 0.2533111572265625, |
|
"learning_rate": 1.8012557287367392e-07, |
|
"loss": 0.979, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.009375185426926957, |
|
"grad_norm": 0.27901849150657654, |
|
"learning_rate": 4.5051689765929214e-08, |
|
"loss": 0.9472, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.009493858660179197, |
|
"grad_norm": 0.41356849670410156, |
|
"learning_rate": 0.0, |
|
"loss": 0.7907, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.009493858660179197, |
|
"eval_loss": 0.8899700045585632, |
|
"eval_runtime": 511.1186, |
|
"eval_samples_per_second": 34.708, |
|
"eval_steps_per_second": 17.354, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6483000014929920.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|