|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.16532341392849761, |
|
"eval_steps": 100, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004133085348212441, |
|
"eval_loss": 1.703283429145813, |
|
"eval_runtime": 130.4689, |
|
"eval_samples_per_second": 7.81, |
|
"eval_steps_per_second": 3.909, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0020665426741062203, |
|
"grad_norm": 1.026974081993103, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.6789, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0041330853482124405, |
|
"grad_norm": 0.8587822318077087, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.5597, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006199628022318661, |
|
"grad_norm": 1.4193518161773682, |
|
"learning_rate": 5e-05, |
|
"loss": 1.853, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.008266170696424881, |
|
"grad_norm": 1.894394040107727, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.6418, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010332713370531101, |
|
"grad_norm": 1.5550564527511597, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.3564, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.012399256044637322, |
|
"grad_norm": 2.1608707904815674, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7369, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014465798718743542, |
|
"grad_norm": 1.5144590139389038, |
|
"learning_rate": 9.995494831023409e-05, |
|
"loss": 1.0681, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.016532341392849762, |
|
"grad_norm": 1.3637126684188843, |
|
"learning_rate": 9.981987442712633e-05, |
|
"loss": 0.9692, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.018598884066955982, |
|
"grad_norm": 1.5348516702651978, |
|
"learning_rate": 9.959502176294383e-05, |
|
"loss": 1.2228, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.020665426741062202, |
|
"grad_norm": 2.122760534286499, |
|
"learning_rate": 9.928079551738543e-05, |
|
"loss": 0.9679, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02273196941516842, |
|
"grad_norm": 1.2727136611938477, |
|
"learning_rate": 9.887776194738432e-05, |
|
"loss": 1.1951, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.024798512089274645, |
|
"grad_norm": 0.9782228469848633, |
|
"learning_rate": 9.838664734667495e-05, |
|
"loss": 0.9479, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.026865054763380865, |
|
"grad_norm": 1.4281666278839111, |
|
"learning_rate": 9.780833673696254e-05, |
|
"loss": 0.817, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.028931597437487085, |
|
"grad_norm": 1.0529621839523315, |
|
"learning_rate": 9.714387227305422e-05, |
|
"loss": 0.83, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.030998140111593304, |
|
"grad_norm": 1.318174958229065, |
|
"learning_rate": 9.639445136482548e-05, |
|
"loss": 0.9337, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.033064682785699524, |
|
"grad_norm": 1.516811728477478, |
|
"learning_rate": 9.55614245194068e-05, |
|
"loss": 0.8914, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.035131225459805744, |
|
"grad_norm": 1.2559555768966675, |
|
"learning_rate": 9.464629290747842e-05, |
|
"loss": 0.8251, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.037197768133911964, |
|
"grad_norm": 1.7071189880371094, |
|
"learning_rate": 9.365070565805941e-05, |
|
"loss": 0.9287, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.039264310808018184, |
|
"grad_norm": 1.842026710510254, |
|
"learning_rate": 9.257645688666556e-05, |
|
"loss": 0.8464, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.041330853482124404, |
|
"grad_norm": 2.377781867980957, |
|
"learning_rate": 9.142548246219212e-05, |
|
"loss": 0.7113, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.041330853482124404, |
|
"eval_loss": 0.8705713748931885, |
|
"eval_runtime": 132.3524, |
|
"eval_samples_per_second": 7.699, |
|
"eval_steps_per_second": 3.853, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04339739615623062, |
|
"grad_norm": 1.1244537830352783, |
|
"learning_rate": 9.019985651834703e-05, |
|
"loss": 1.0297, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.04546393883033684, |
|
"grad_norm": 0.866783857345581, |
|
"learning_rate": 8.890178771592199e-05, |
|
"loss": 0.6887, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04753048150444307, |
|
"grad_norm": 1.2207870483398438, |
|
"learning_rate": 8.753361526263621e-05, |
|
"loss": 0.9184, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.04959702417854929, |
|
"grad_norm": 1.4106080532073975, |
|
"learning_rate": 8.609780469772623e-05, |
|
"loss": 1.0763, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05166356685265551, |
|
"grad_norm": 1.3751602172851562, |
|
"learning_rate": 8.459694344887732e-05, |
|
"loss": 0.822, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05373010952676173, |
|
"grad_norm": 1.327123999595642, |
|
"learning_rate": 8.303373616950408e-05, |
|
"loss": 0.817, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05579665220086795, |
|
"grad_norm": 1.3972790241241455, |
|
"learning_rate": 8.141099986478212e-05, |
|
"loss": 1.0442, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.05786319487497417, |
|
"grad_norm": 1.3133134841918945, |
|
"learning_rate": 7.973165881521434e-05, |
|
"loss": 0.7807, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05992973754908039, |
|
"grad_norm": 1.2477699518203735, |
|
"learning_rate": 7.799873930687978e-05, |
|
"loss": 0.8035, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.06199628022318661, |
|
"grad_norm": 2.311676025390625, |
|
"learning_rate": 7.621536417786159e-05, |
|
"loss": 0.7261, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06406282289729283, |
|
"grad_norm": 1.013773798942566, |
|
"learning_rate": 7.438474719068173e-05, |
|
"loss": 0.8035, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.06612936557139905, |
|
"grad_norm": 1.2594728469848633, |
|
"learning_rate": 7.251018724088367e-05, |
|
"loss": 1.2357, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06819590824550527, |
|
"grad_norm": 1.3990025520324707, |
|
"learning_rate": 7.059506241219965e-05, |
|
"loss": 0.9476, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.07026245091961149, |
|
"grad_norm": 1.3124955892562866, |
|
"learning_rate": 6.864282388901544e-05, |
|
"loss": 1.0208, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07232899359371771, |
|
"grad_norm": 1.2700549364089966, |
|
"learning_rate": 6.665698973710288e-05, |
|
"loss": 0.8046, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.07439553626782393, |
|
"grad_norm": 0.9456565380096436, |
|
"learning_rate": 6.464113856382752e-05, |
|
"loss": 0.9784, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07646207894193015, |
|
"grad_norm": 1.3909025192260742, |
|
"learning_rate": 6.259890306925627e-05, |
|
"loss": 0.8895, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.07852862161603637, |
|
"grad_norm": 1.400960087776184, |
|
"learning_rate": 6.0533963499786314e-05, |
|
"loss": 0.6296, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08059516429014259, |
|
"grad_norm": 1.4584358930587769, |
|
"learning_rate": 5.8450041016092464e-05, |
|
"loss": 0.8387, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.08266170696424881, |
|
"grad_norm": 1.6872780323028564, |
|
"learning_rate": 5.6350890987343944e-05, |
|
"loss": 0.6882, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08266170696424881, |
|
"eval_loss": 0.8160419464111328, |
|
"eval_runtime": 132.2449, |
|
"eval_samples_per_second": 7.705, |
|
"eval_steps_per_second": 3.856, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08472824963835503, |
|
"grad_norm": 0.7275150418281555, |
|
"learning_rate": 5.4240296223775465e-05, |
|
"loss": 1.1285, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.08679479231246125, |
|
"grad_norm": 1.6055395603179932, |
|
"learning_rate": 5.212206015980742e-05, |
|
"loss": 1.0795, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08886133498656747, |
|
"grad_norm": 1.6460407972335815, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7667, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.09092787766067369, |
|
"grad_norm": 1.2784044742584229, |
|
"learning_rate": 4.78779398401926e-05, |
|
"loss": 1.0481, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0929944203347799, |
|
"grad_norm": 1.3674898147583008, |
|
"learning_rate": 4.575970377622456e-05, |
|
"loss": 0.7733, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.09506096300888614, |
|
"grad_norm": 1.7782163619995117, |
|
"learning_rate": 4.364910901265606e-05, |
|
"loss": 1.075, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09712750568299236, |
|
"grad_norm": 1.6913050413131714, |
|
"learning_rate": 4.1549958983907555e-05, |
|
"loss": 0.8413, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.09919404835709858, |
|
"grad_norm": 1.4041593074798584, |
|
"learning_rate": 3.94660365002137e-05, |
|
"loss": 0.7328, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1012605910312048, |
|
"grad_norm": 1.8579920530319214, |
|
"learning_rate": 3.740109693074375e-05, |
|
"loss": 0.6929, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.10332713370531102, |
|
"grad_norm": 1.874348759651184, |
|
"learning_rate": 3.5358861436172485e-05, |
|
"loss": 0.628, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.10539367637941724, |
|
"grad_norm": 1.1385961771011353, |
|
"learning_rate": 3.334301026289712e-05, |
|
"loss": 1.0175, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.10746021905352346, |
|
"grad_norm": 1.177496075630188, |
|
"learning_rate": 3.135717611098458e-05, |
|
"loss": 0.8475, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.10952676172762968, |
|
"grad_norm": 0.819813072681427, |
|
"learning_rate": 2.9404937587800375e-05, |
|
"loss": 0.7876, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.1115933044017359, |
|
"grad_norm": 1.1457738876342773, |
|
"learning_rate": 2.748981275911633e-05, |
|
"loss": 0.7686, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11365984707584212, |
|
"grad_norm": 1.3542187213897705, |
|
"learning_rate": 2.5615252809318284e-05, |
|
"loss": 0.9639, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.11572638974994834, |
|
"grad_norm": 1.3880482912063599, |
|
"learning_rate": 2.3784635822138424e-05, |
|
"loss": 0.6768, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11779293242405456, |
|
"grad_norm": 1.575758934020996, |
|
"learning_rate": 2.2001260693120233e-05, |
|
"loss": 0.8948, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.11985947509816078, |
|
"grad_norm": 1.2389308214187622, |
|
"learning_rate": 2.026834118478567e-05, |
|
"loss": 0.7186, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.121926017772267, |
|
"grad_norm": 1.2766398191452026, |
|
"learning_rate": 1.858900013521788e-05, |
|
"loss": 0.693, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.12399256044637322, |
|
"grad_norm": 1.8179432153701782, |
|
"learning_rate": 1.6966263830495936e-05, |
|
"loss": 0.6462, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12399256044637322, |
|
"eval_loss": 0.7967092990875244, |
|
"eval_runtime": 132.1257, |
|
"eval_samples_per_second": 7.712, |
|
"eval_steps_per_second": 3.86, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12605910312047944, |
|
"grad_norm": 0.7972692847251892, |
|
"learning_rate": 1.5403056551122697e-05, |
|
"loss": 0.9078, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.12812564579458566, |
|
"grad_norm": 1.3046032190322876, |
|
"learning_rate": 1.3902195302273779e-05, |
|
"loss": 1.0395, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13019218846869188, |
|
"grad_norm": 1.3980432748794556, |
|
"learning_rate": 1.246638473736378e-05, |
|
"loss": 1.1, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1322587311427981, |
|
"grad_norm": 1.0495986938476562, |
|
"learning_rate": 1.1098212284078036e-05, |
|
"loss": 0.7195, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13432527381690432, |
|
"grad_norm": 1.4138555526733398, |
|
"learning_rate": 9.800143481652979e-06, |
|
"loss": 0.7553, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.13639181649101054, |
|
"grad_norm": 1.1162981986999512, |
|
"learning_rate": 8.574517537807897e-06, |
|
"loss": 0.7799, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13845835916511676, |
|
"grad_norm": 1.5671554803848267, |
|
"learning_rate": 7.423543113334436e-06, |
|
"loss": 0.6911, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.14052490183922298, |
|
"grad_norm": 2.3479394912719727, |
|
"learning_rate": 6.349294341940593e-06, |
|
"loss": 0.8078, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1425914445133292, |
|
"grad_norm": 1.5182538032531738, |
|
"learning_rate": 5.353707092521582e-06, |
|
"loss": 0.648, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.14465798718743542, |
|
"grad_norm": 2.3883490562438965, |
|
"learning_rate": 4.43857548059321e-06, |
|
"loss": 0.9168, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14672452986154164, |
|
"grad_norm": 1.0066900253295898, |
|
"learning_rate": 3.605548635174533e-06, |
|
"loss": 0.9429, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.14879107253564786, |
|
"grad_norm": 1.1294912099838257, |
|
"learning_rate": 2.85612772694579e-06, |
|
"loss": 0.9248, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15085761520975408, |
|
"grad_norm": 0.8687814474105835, |
|
"learning_rate": 2.191663263037458e-06, |
|
"loss": 0.5667, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.1529241578838603, |
|
"grad_norm": 1.3007910251617432, |
|
"learning_rate": 1.6133526533250565e-06, |
|
"loss": 0.7796, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15499070055796652, |
|
"grad_norm": 1.1952354907989502, |
|
"learning_rate": 1.1222380526156928e-06, |
|
"loss": 0.8654, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.15705724323207274, |
|
"grad_norm": 1.6118597984313965, |
|
"learning_rate": 7.192044826145771e-07, |
|
"loss": 0.6932, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.15912378590617896, |
|
"grad_norm": 1.714866042137146, |
|
"learning_rate": 4.049782370561583e-07, |
|
"loss": 0.7152, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.16119032858028517, |
|
"grad_norm": 1.3845038414001465, |
|
"learning_rate": 1.8012557287367392e-07, |
|
"loss": 0.8241, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1632568712543914, |
|
"grad_norm": 1.519347071647644, |
|
"learning_rate": 4.5051689765929214e-08, |
|
"loss": 0.8239, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.16532341392849761, |
|
"grad_norm": 2.4190309047698975, |
|
"learning_rate": 0.0, |
|
"loss": 0.5018, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.16532341392849761, |
|
"eval_loss": 0.7933396100997925, |
|
"eval_runtime": 132.3088, |
|
"eval_samples_per_second": 7.702, |
|
"eval_steps_per_second": 3.855, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.53296434479104e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|