|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.03474222015791918, |
|
"eval_steps": 17, |
|
"global_step": 187, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00018578727357176033, |
|
"eval_loss": 1.5152398347854614, |
|
"eval_runtime": 275.8856, |
|
"eval_samples_per_second": 32.861, |
|
"eval_steps_per_second": 4.11, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.000557361820715281, |
|
"grad_norm": 4.119506359100342, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6013, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.001114723641430562, |
|
"grad_norm": 11.024518966674805, |
|
"learning_rate": 6e-05, |
|
"loss": 1.3784, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.001672085462145843, |
|
"grad_norm": 3.6604344844818115, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2692, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.002229447282861124, |
|
"grad_norm": 12.112958908081055, |
|
"learning_rate": 9.997266286704631e-05, |
|
"loss": 1.1006, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.002786809103576405, |
|
"grad_norm": 2.718080997467041, |
|
"learning_rate": 9.98292246503335e-05, |
|
"loss": 1.1317, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0031583836507199257, |
|
"eval_loss": 0.9914608597755432, |
|
"eval_runtime": 278.075, |
|
"eval_samples_per_second": 32.603, |
|
"eval_steps_per_second": 4.078, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.003344170924291686, |
|
"grad_norm": 1.8613860607147217, |
|
"learning_rate": 9.956320346634876e-05, |
|
"loss": 0.994, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0039015327450069672, |
|
"grad_norm": 1.679516315460205, |
|
"learning_rate": 9.917525374361912e-05, |
|
"loss": 0.8489, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.004458894565722248, |
|
"grad_norm": 1.1957541704177856, |
|
"learning_rate": 9.86663298624003e-05, |
|
"loss": 0.951, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.005016256386437529, |
|
"grad_norm": 0.9221785068511963, |
|
"learning_rate": 9.803768380684242e-05, |
|
"loss": 0.7897, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00557361820715281, |
|
"grad_norm": 1.2563328742980957, |
|
"learning_rate": 9.729086208503174e-05, |
|
"loss": 0.8182, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006130980027868091, |
|
"grad_norm": 1.1113193035125732, |
|
"learning_rate": 9.642770192448536e-05, |
|
"loss": 0.8223, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.006316767301439851, |
|
"eval_loss": 0.8207408785820007, |
|
"eval_runtime": 278.0489, |
|
"eval_samples_per_second": 32.606, |
|
"eval_steps_per_second": 4.078, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.006688341848583372, |
|
"grad_norm": 1.2708672285079956, |
|
"learning_rate": 9.545032675245813e-05, |
|
"loss": 0.8516, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.007245703669298653, |
|
"grad_norm": 1.0124180316925049, |
|
"learning_rate": 9.43611409721806e-05, |
|
"loss": 0.8231, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0078030654900139345, |
|
"grad_norm": 0.8520745038986206, |
|
"learning_rate": 9.316282404787871e-05, |
|
"loss": 0.7552, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.008360427310729215, |
|
"grad_norm": 1.1972272396087646, |
|
"learning_rate": 9.185832391312644e-05, |
|
"loss": 0.7681, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.008917789131444497, |
|
"grad_norm": 0.9560966491699219, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.7997, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.009475150952159776, |
|
"grad_norm": 0.8837025165557861, |
|
"learning_rate": 8.894386393810563e-05, |
|
"loss": 0.7901, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.009475150952159776, |
|
"eval_loss": 0.7770994305610657, |
|
"eval_runtime": 278.1693, |
|
"eval_samples_per_second": 32.592, |
|
"eval_steps_per_second": 4.077, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.010032512772875058, |
|
"grad_norm": 1.023627519607544, |
|
"learning_rate": 8.73410738492077e-05, |
|
"loss": 0.7579, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.01058987459359034, |
|
"grad_norm": 0.8764439225196838, |
|
"learning_rate": 8.564642241456986e-05, |
|
"loss": 0.7208, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.01114723641430562, |
|
"grad_norm": 0.7949612140655518, |
|
"learning_rate": 8.386407858128706e-05, |
|
"loss": 0.7907, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0117045982350209, |
|
"grad_norm": 1.581484079360962, |
|
"learning_rate": 8.199842702516583e-05, |
|
"loss": 0.7894, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.012261960055736182, |
|
"grad_norm": 1.0892794132232666, |
|
"learning_rate": 8.005405736415126e-05, |
|
"loss": 0.8245, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.012633534602879703, |
|
"eval_loss": 0.7482102513313293, |
|
"eval_runtime": 278.4703, |
|
"eval_samples_per_second": 32.556, |
|
"eval_steps_per_second": 4.072, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.012819321876451463, |
|
"grad_norm": 0.9416138529777527, |
|
"learning_rate": 7.803575286758364e-05, |
|
"loss": 0.7452, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.013376683697166745, |
|
"grad_norm": 0.9427577257156372, |
|
"learning_rate": 7.594847868906076e-05, |
|
"loss": 0.7362, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.013934045517882025, |
|
"grad_norm": 0.8998479247093201, |
|
"learning_rate": 7.379736965185368e-05, |
|
"loss": 0.6653, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.014491407338597306, |
|
"grad_norm": 0.9068602919578552, |
|
"learning_rate": 7.158771761692464e-05, |
|
"loss": 0.7179, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.015048769159312587, |
|
"grad_norm": 0.8577080368995667, |
|
"learning_rate": 6.932495846462261e-05, |
|
"loss": 0.7525, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.015606130980027869, |
|
"grad_norm": 1.0340560674667358, |
|
"learning_rate": 6.701465872208216e-05, |
|
"loss": 0.7724, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.01579191825359963, |
|
"eval_loss": 0.7278433442115784, |
|
"eval_runtime": 278.2156, |
|
"eval_samples_per_second": 32.586, |
|
"eval_steps_per_second": 4.076, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.01616349280074315, |
|
"grad_norm": 0.8426377773284912, |
|
"learning_rate": 6.466250186922325e-05, |
|
"loss": 0.751, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.01672085462145843, |
|
"grad_norm": 0.6842407584190369, |
|
"learning_rate": 6.227427435703997e-05, |
|
"loss": 0.6239, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01727821644217371, |
|
"grad_norm": 0.943516731262207, |
|
"learning_rate": 5.985585137257401e-05, |
|
"loss": 0.6827, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.017835578262888993, |
|
"grad_norm": 0.9946945905685425, |
|
"learning_rate": 5.74131823855921e-05, |
|
"loss": 0.6946, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.018392940083604273, |
|
"grad_norm": 0.9049971699714661, |
|
"learning_rate": 5.495227651252315e-05, |
|
"loss": 0.7255, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.018950301904319553, |
|
"grad_norm": 0.8010388016700745, |
|
"learning_rate": 5.247918773366112e-05, |
|
"loss": 0.7617, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.018950301904319553, |
|
"eval_loss": 0.7182794809341431, |
|
"eval_runtime": 278.3252, |
|
"eval_samples_per_second": 32.573, |
|
"eval_steps_per_second": 4.074, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.019507663725034836, |
|
"grad_norm": 0.8912818431854248, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6662, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.020065025545750115, |
|
"grad_norm": 0.8037520051002502, |
|
"learning_rate": 4.7520812266338885e-05, |
|
"loss": 0.655, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.0206223873664654, |
|
"grad_norm": 0.9647955894470215, |
|
"learning_rate": 4.504772348747687e-05, |
|
"loss": 0.7077, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.02117974918718068, |
|
"grad_norm": 0.7562959790229797, |
|
"learning_rate": 4.2586817614407895e-05, |
|
"loss": 0.6977, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.021737111007895958, |
|
"grad_norm": 0.7197637557983398, |
|
"learning_rate": 4.0144148627425993e-05, |
|
"loss": 0.7263, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.02210868555503948, |
|
"eval_loss": 0.7108221054077148, |
|
"eval_runtime": 278.2385, |
|
"eval_samples_per_second": 32.584, |
|
"eval_steps_per_second": 4.076, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.02229447282861124, |
|
"grad_norm": 0.7262760400772095, |
|
"learning_rate": 3.772572564296005e-05, |
|
"loss": 0.6754, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02285183464932652, |
|
"grad_norm": 0.6029306650161743, |
|
"learning_rate": 3.533749813077677e-05, |
|
"loss": 0.7083, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0234091964700418, |
|
"grad_norm": 0.7203994393348694, |
|
"learning_rate": 3.298534127791785e-05, |
|
"loss": 0.6682, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.023966558290757084, |
|
"grad_norm": 0.663855254650116, |
|
"learning_rate": 3.0675041535377405e-05, |
|
"loss": 0.6607, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.024523920111472364, |
|
"grad_norm": 1.1171367168426514, |
|
"learning_rate": 2.8412282383075363e-05, |
|
"loss": 0.7137, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.025081281932187643, |
|
"grad_norm": 0.8257487416267395, |
|
"learning_rate": 2.6202630348146324e-05, |
|
"loss": 0.6731, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.025267069205759406, |
|
"eval_loss": 0.7051795125007629, |
|
"eval_runtime": 278.2312, |
|
"eval_samples_per_second": 32.584, |
|
"eval_steps_per_second": 4.076, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.025638643752902927, |
|
"grad_norm": 0.7469737529754639, |
|
"learning_rate": 2.405152131093926e-05, |
|
"loss": 0.66, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.026196005573618206, |
|
"grad_norm": 0.7631163001060486, |
|
"learning_rate": 2.196424713241637e-05, |
|
"loss": 0.7156, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.02675336739433349, |
|
"grad_norm": 0.8524065017700195, |
|
"learning_rate": 1.9945942635848748e-05, |
|
"loss": 0.6498, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.02731072921504877, |
|
"grad_norm": 0.7686144113540649, |
|
"learning_rate": 1.800157297483417e-05, |
|
"loss": 0.6891, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.02786809103576405, |
|
"grad_norm": 0.8990215063095093, |
|
"learning_rate": 1.6135921418712956e-05, |
|
"loss": 0.583, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.028425452856479332, |
|
"grad_norm": 0.7455560564994812, |
|
"learning_rate": 1.435357758543015e-05, |
|
"loss": 0.6584, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.028425452856479332, |
|
"eval_loss": 0.7012729048728943, |
|
"eval_runtime": 278.4629, |
|
"eval_samples_per_second": 32.557, |
|
"eval_steps_per_second": 4.072, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.028982814677194612, |
|
"grad_norm": 0.7660591006278992, |
|
"learning_rate": 1.2658926150792322e-05, |
|
"loss": 0.6667, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.02954017649790989, |
|
"grad_norm": 0.6604006290435791, |
|
"learning_rate": 1.1056136061894384e-05, |
|
"loss": 0.6237, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.030097538318625175, |
|
"grad_norm": 0.8076897263526917, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.6899, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.030654900139340455, |
|
"grad_norm": 0.8138939738273621, |
|
"learning_rate": 8.141676086873572e-06, |
|
"loss": 0.606, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.031212261960055738, |
|
"grad_norm": 0.7922996878623962, |
|
"learning_rate": 6.837175952121306e-06, |
|
"loss": 0.7121, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.03158383650719926, |
|
"eval_loss": 0.6988500356674194, |
|
"eval_runtime": 278.3236, |
|
"eval_samples_per_second": 32.574, |
|
"eval_steps_per_second": 4.074, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.031769623780771014, |
|
"grad_norm": 0.8445965051651001, |
|
"learning_rate": 5.6388590278194096e-06, |
|
"loss": 0.6391, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.0323269856014863, |
|
"grad_norm": 0.7874613404273987, |
|
"learning_rate": 4.549673247541875e-06, |
|
"loss": 0.7674, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.03288434742220158, |
|
"grad_norm": 0.8437705636024475, |
|
"learning_rate": 3.5722980755146517e-06, |
|
"loss": 0.6699, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.03344170924291686, |
|
"grad_norm": 0.7565687298774719, |
|
"learning_rate": 2.7091379149682685e-06, |
|
"loss": 0.7346, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03399907106363214, |
|
"grad_norm": 0.7713425159454346, |
|
"learning_rate": 1.962316193157593e-06, |
|
"loss": 0.6421, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.03455643288434742, |
|
"grad_norm": 0.755294144153595, |
|
"learning_rate": 1.333670137599713e-06, |
|
"loss": 0.6981, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.03474222015791918, |
|
"eval_loss": 0.6977519392967224, |
|
"eval_runtime": 278.5744, |
|
"eval_samples_per_second": 32.544, |
|
"eval_steps_per_second": 4.071, |
|
"step": 187 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 17, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.567372765580493e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|