{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009493858660179197, "eval_steps": 100, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.373464665044799e-05, "eval_loss": 1.1060571670532227, "eval_runtime": 512.4208, "eval_samples_per_second": 34.62, "eval_steps_per_second": 17.31, "step": 1 }, { "epoch": 0.00011867323325223996, "grad_norm": 0.06994107365608215, "learning_rate": 1.6666666666666667e-05, "loss": 0.7434, "step": 5 }, { "epoch": 0.00023734646650447992, "grad_norm": 0.08292974531650543, "learning_rate": 3.3333333333333335e-05, "loss": 0.86, "step": 10 }, { "epoch": 0.00035601969975671986, "grad_norm": 0.11479827016592026, "learning_rate": 5e-05, "loss": 0.9153, "step": 15 }, { "epoch": 0.00047469293300895983, "grad_norm": 0.13735784590244293, "learning_rate": 6.666666666666667e-05, "loss": 0.945, "step": 20 }, { "epoch": 0.0005933661662611998, "grad_norm": 0.15192271769046783, "learning_rate": 8.333333333333334e-05, "loss": 1.0809, "step": 25 }, { "epoch": 0.0007120393995134397, "grad_norm": 0.148358553647995, "learning_rate": 0.0001, "loss": 1.0627, "step": 30 }, { "epoch": 0.0008307126327656797, "grad_norm": 0.230050191283226, "learning_rate": 9.995494831023409e-05, "loss": 1.1128, "step": 35 }, { "epoch": 0.0009493858660179197, "grad_norm": 0.2764645218849182, "learning_rate": 9.981987442712633e-05, "loss": 1.0843, "step": 40 }, { "epoch": 0.0010680590992701597, "grad_norm": 0.2294786274433136, "learning_rate": 9.959502176294383e-05, "loss": 1.0233, "step": 45 }, { "epoch": 0.0011867323325223996, "grad_norm": 0.3697080612182617, "learning_rate": 9.928079551738543e-05, "loss": 0.9998, "step": 50 }, { "epoch": 0.0013054055657746395, "grad_norm": 0.12134025990962982, "learning_rate": 9.887776194738432e-05, "loss": 0.813, "step": 55 }, { "epoch": 0.0014240787990268794, "grad_norm": 0.1504671722650528, "learning_rate": 9.838664734667495e-05, "loss": 0.8707, "step": 60 }, { "epoch": 0.0015427520322791194, "grad_norm": 0.12335077673196793, "learning_rate": 9.780833673696254e-05, "loss": 0.8697, "step": 65 }, { "epoch": 0.0016614252655313595, "grad_norm": 0.1612686961889267, "learning_rate": 9.714387227305422e-05, "loss": 0.8871, "step": 70 }, { "epoch": 0.0017800984987835994, "grad_norm": 0.15226492285728455, "learning_rate": 9.639445136482548e-05, "loss": 1.024, "step": 75 }, { "epoch": 0.0018987717320358393, "grad_norm": 0.1592480093240738, "learning_rate": 9.55614245194068e-05, "loss": 1.006, "step": 80 }, { "epoch": 0.0020174449652880793, "grad_norm": 0.16184721887111664, "learning_rate": 9.464629290747842e-05, "loss": 0.9669, "step": 85 }, { "epoch": 0.0021361181985403194, "grad_norm": 0.20670561492443085, "learning_rate": 9.365070565805941e-05, "loss": 0.941, "step": 90 }, { "epoch": 0.002254791431792559, "grad_norm": 0.2282506376504898, "learning_rate": 9.257645688666556e-05, "loss": 0.9345, "step": 95 }, { "epoch": 0.0023734646650447992, "grad_norm": 0.40875211358070374, "learning_rate": 9.142548246219212e-05, "loss": 0.9923, "step": 100 }, { "epoch": 0.0023734646650447992, "eval_loss": 0.9161506295204163, "eval_runtime": 511.6193, "eval_samples_per_second": 34.674, "eval_steps_per_second": 17.337, "step": 100 }, { "epoch": 0.002492137898297039, "grad_norm": 0.09706299751996994, "learning_rate": 9.019985651834703e-05, "loss": 0.7927, "step": 105 }, { "epoch": 0.002610811131549279, "grad_norm": 0.12703746557235718, "learning_rate": 8.890178771592199e-05, "loss": 0.9126, "step": 110 }, { "epoch": 0.002729484364801519, "grad_norm": 0.152408629655838, "learning_rate": 8.753361526263621e-05, "loss": 0.8603, "step": 115 }, { "epoch": 0.002848157598053759, "grad_norm": 0.15565912425518036, "learning_rate": 8.609780469772623e-05, "loss": 0.863, "step": 120 }, { "epoch": 0.002966830831305999, "grad_norm": 0.1481216698884964, "learning_rate": 8.459694344887732e-05, "loss": 0.8306, "step": 125 }, { "epoch": 0.0030855040645582387, "grad_norm": 0.16268706321716309, "learning_rate": 8.303373616950408e-05, "loss": 0.9171, "step": 130 }, { "epoch": 0.003204177297810479, "grad_norm": 0.19657278060913086, "learning_rate": 8.141099986478212e-05, "loss": 1.0378, "step": 135 }, { "epoch": 0.003322850531062719, "grad_norm": 0.21237877011299133, "learning_rate": 7.973165881521434e-05, "loss": 0.9739, "step": 140 }, { "epoch": 0.0034415237643149587, "grad_norm": 0.22750777006149292, "learning_rate": 7.799873930687978e-05, "loss": 0.89, "step": 145 }, { "epoch": 0.003560196997567199, "grad_norm": 0.4223545789718628, "learning_rate": 7.621536417786159e-05, "loss": 0.8019, "step": 150 }, { "epoch": 0.0036788702308194385, "grad_norm": 0.10099593549966812, "learning_rate": 7.438474719068173e-05, "loss": 0.7283, "step": 155 }, { "epoch": 0.0037975434640716787, "grad_norm": 0.15844862163066864, "learning_rate": 7.251018724088367e-05, "loss": 0.8589, "step": 160 }, { "epoch": 0.003916216697323918, "grad_norm": 0.13598279654979706, "learning_rate": 7.059506241219965e-05, "loss": 0.8389, "step": 165 }, { "epoch": 0.0040348899305761585, "grad_norm": 0.1379881352186203, "learning_rate": 6.864282388901544e-05, "loss": 0.8838, "step": 170 }, { "epoch": 0.004153563163828399, "grad_norm": 0.1695917397737503, "learning_rate": 6.665698973710288e-05, "loss": 0.8816, "step": 175 }, { "epoch": 0.004272236397080639, "grad_norm": 0.1812783181667328, "learning_rate": 6.464113856382752e-05, "loss": 0.941, "step": 180 }, { "epoch": 0.004390909630332878, "grad_norm": 0.19394883513450623, "learning_rate": 6.259890306925627e-05, "loss": 0.9778, "step": 185 }, { "epoch": 0.004509582863585118, "grad_norm": 0.23465299606323242, "learning_rate": 6.0533963499786314e-05, "loss": 0.9645, "step": 190 }, { "epoch": 0.004628256096837358, "grad_norm": 0.25456446409225464, "learning_rate": 5.8450041016092464e-05, "loss": 0.9213, "step": 195 }, { "epoch": 0.0047469293300895984, "grad_norm": 0.4076145887374878, "learning_rate": 5.6350890987343944e-05, "loss": 0.884, "step": 200 }, { "epoch": 0.0047469293300895984, "eval_loss": 0.897860586643219, "eval_runtime": 510.3146, "eval_samples_per_second": 34.763, "eval_steps_per_second": 17.381, "step": 200 }, { "epoch": 0.004865602563341839, "grad_norm": 0.1094803586602211, "learning_rate": 5.4240296223775465e-05, "loss": 0.7631, "step": 205 }, { "epoch": 0.004984275796594078, "grad_norm": 0.14633063971996307, "learning_rate": 5.212206015980742e-05, "loss": 0.8653, "step": 210 }, { "epoch": 0.005102949029846318, "grad_norm": 0.12594805657863617, "learning_rate": 5e-05, "loss": 0.8278, "step": 215 }, { "epoch": 0.005221622263098558, "grad_norm": 0.1753065288066864, "learning_rate": 4.78779398401926e-05, "loss": 0.8959, "step": 220 }, { "epoch": 0.005340295496350798, "grad_norm": 0.1741386353969574, "learning_rate": 4.575970377622456e-05, "loss": 0.9233, "step": 225 }, { "epoch": 0.005458968729603038, "grad_norm": 0.19290021061897278, "learning_rate": 4.364910901265606e-05, "loss": 0.9547, "step": 230 }, { "epoch": 0.005577641962855278, "grad_norm": 0.22662971913814545, "learning_rate": 4.1549958983907555e-05, "loss": 0.9353, "step": 235 }, { "epoch": 0.005696315196107518, "grad_norm": 0.2653945982456207, "learning_rate": 3.94660365002137e-05, "loss": 0.9487, "step": 240 }, { "epoch": 0.005814988429359758, "grad_norm": 0.2608093321323395, "learning_rate": 3.740109693074375e-05, "loss": 0.9119, "step": 245 }, { "epoch": 0.005933661662611998, "grad_norm": 0.4665874242782593, "learning_rate": 3.5358861436172485e-05, "loss": 0.904, "step": 250 }, { "epoch": 0.006052334895864238, "grad_norm": 0.10215272009372711, "learning_rate": 3.334301026289712e-05, "loss": 0.7477, "step": 255 }, { "epoch": 0.0061710081291164775, "grad_norm": 0.1244097575545311, "learning_rate": 3.135717611098458e-05, "loss": 0.7905, "step": 260 }, { "epoch": 0.006289681362368718, "grad_norm": 0.16128472983837128, "learning_rate": 2.9404937587800375e-05, "loss": 0.8343, "step": 265 }, { "epoch": 0.006408354595620958, "grad_norm": 0.15076673030853271, "learning_rate": 2.748981275911633e-05, "loss": 0.7976, "step": 270 }, { "epoch": 0.006527027828873198, "grad_norm": 0.17758683860301971, "learning_rate": 2.5615252809318284e-05, "loss": 0.9116, "step": 275 }, { "epoch": 0.006645701062125438, "grad_norm": 0.22221983969211578, "learning_rate": 2.3784635822138424e-05, "loss": 0.9473, "step": 280 }, { "epoch": 0.006764374295377677, "grad_norm": 0.2009792923927307, "learning_rate": 2.2001260693120233e-05, "loss": 0.9147, "step": 285 }, { "epoch": 0.006883047528629917, "grad_norm": 0.24482344090938568, "learning_rate": 2.026834118478567e-05, "loss": 1.0044, "step": 290 }, { "epoch": 0.0070017207618821575, "grad_norm": 0.28756579756736755, "learning_rate": 1.858900013521788e-05, "loss": 1.0234, "step": 295 }, { "epoch": 0.007120393995134398, "grad_norm": 0.4237484931945801, "learning_rate": 1.6966263830495936e-05, "loss": 0.8254, "step": 300 }, { "epoch": 0.007120393995134398, "eval_loss": 0.890705406665802, "eval_runtime": 510.6082, "eval_samples_per_second": 34.743, "eval_steps_per_second": 17.371, "step": 300 }, { "epoch": 0.007239067228386638, "grad_norm": 0.09181042015552521, "learning_rate": 1.5403056551122697e-05, "loss": 0.7054, "step": 305 }, { "epoch": 0.007357740461638877, "grad_norm": 0.1410045176744461, "learning_rate": 1.3902195302273779e-05, "loss": 0.8037, "step": 310 }, { "epoch": 0.007476413694891117, "grad_norm": 0.1454688459634781, "learning_rate": 1.246638473736378e-05, "loss": 0.8922, "step": 315 }, { "epoch": 0.007595086928143357, "grad_norm": 0.17593573033809662, "learning_rate": 1.1098212284078036e-05, "loss": 0.9334, "step": 320 }, { "epoch": 0.0077137601613955975, "grad_norm": 0.18990731239318848, "learning_rate": 9.800143481652979e-06, "loss": 0.9446, "step": 325 }, { "epoch": 0.007832433394647837, "grad_norm": 0.1836443394422531, "learning_rate": 8.574517537807897e-06, "loss": 0.9762, "step": 330 }, { "epoch": 0.007951106627900077, "grad_norm": 0.23809511959552765, "learning_rate": 7.423543113334436e-06, "loss": 0.9713, "step": 335 }, { "epoch": 0.008069779861152317, "grad_norm": 0.23665811121463776, "learning_rate": 6.349294341940593e-06, "loss": 0.9209, "step": 340 }, { "epoch": 0.008188453094404557, "grad_norm": 0.2725285291671753, "learning_rate": 5.353707092521582e-06, "loss": 0.9288, "step": 345 }, { "epoch": 0.008307126327656797, "grad_norm": 0.4677680432796478, "learning_rate": 4.43857548059321e-06, "loss": 0.8756, "step": 350 }, { "epoch": 0.008425799560909037, "grad_norm": 0.10243045538663864, "learning_rate": 3.605548635174533e-06, "loss": 0.7671, "step": 355 }, { "epoch": 0.008544472794161278, "grad_norm": 0.1385096162557602, "learning_rate": 2.85612772694579e-06, "loss": 0.7777, "step": 360 }, { "epoch": 0.008663146027413518, "grad_norm": 0.13208024203777313, "learning_rate": 2.191663263037458e-06, "loss": 0.7676, "step": 365 }, { "epoch": 0.008781819260665756, "grad_norm": 0.16409678757190704, "learning_rate": 1.6133526533250565e-06, "loss": 0.8646, "step": 370 }, { "epoch": 0.008900492493917996, "grad_norm": 0.1728227734565735, "learning_rate": 1.1222380526156928e-06, "loss": 0.911, "step": 375 }, { "epoch": 0.009019165727170236, "grad_norm": 0.19500896334648132, "learning_rate": 7.192044826145771e-07, "loss": 0.9795, "step": 380 }, { "epoch": 0.009137838960422476, "grad_norm": 0.21884216368198395, "learning_rate": 4.049782370561583e-07, "loss": 0.8884, "step": 385 }, { "epoch": 0.009256512193674717, "grad_norm": 0.2533111572265625, "learning_rate": 1.8012557287367392e-07, "loss": 0.979, "step": 390 }, { "epoch": 0.009375185426926957, "grad_norm": 0.27901849150657654, "learning_rate": 4.5051689765929214e-08, "loss": 0.9472, "step": 395 }, { "epoch": 0.009493858660179197, "grad_norm": 0.41356849670410156, "learning_rate": 0.0, "loss": 0.7907, "step": 400 }, { "epoch": 0.009493858660179197, "eval_loss": 0.8899700045585632, "eval_runtime": 511.1186, "eval_samples_per_second": 34.708, "eval_steps_per_second": 17.354, "step": 400 } ], "logging_steps": 5, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6483000014929920.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }