{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03474222015791918, "eval_steps": 17, "global_step": 187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018578727357176033, "eval_loss": 1.5152398347854614, "eval_runtime": 275.8856, "eval_samples_per_second": 32.861, "eval_steps_per_second": 4.11, "step": 1 }, { "epoch": 0.000557361820715281, "grad_norm": 4.119506359100342, "learning_rate": 3e-05, "loss": 1.6013, "step": 3 }, { "epoch": 0.001114723641430562, "grad_norm": 11.024518966674805, "learning_rate": 6e-05, "loss": 1.3784, "step": 6 }, { "epoch": 0.001672085462145843, "grad_norm": 3.6604344844818115, "learning_rate": 9e-05, "loss": 1.2692, "step": 9 }, { "epoch": 0.002229447282861124, "grad_norm": 12.112958908081055, "learning_rate": 9.997266286704631e-05, "loss": 1.1006, "step": 12 }, { "epoch": 0.002786809103576405, "grad_norm": 2.718080997467041, "learning_rate": 9.98292246503335e-05, "loss": 1.1317, "step": 15 }, { "epoch": 0.0031583836507199257, "eval_loss": 0.9914608597755432, "eval_runtime": 278.075, "eval_samples_per_second": 32.603, "eval_steps_per_second": 4.078, "step": 17 }, { "epoch": 0.003344170924291686, "grad_norm": 1.8613860607147217, "learning_rate": 9.956320346634876e-05, "loss": 0.994, "step": 18 }, { "epoch": 0.0039015327450069672, "grad_norm": 1.679516315460205, "learning_rate": 9.917525374361912e-05, "loss": 0.8489, "step": 21 }, { "epoch": 0.004458894565722248, "grad_norm": 1.1957541704177856, "learning_rate": 9.86663298624003e-05, "loss": 0.951, "step": 24 }, { "epoch": 0.005016256386437529, "grad_norm": 0.9221785068511963, "learning_rate": 9.803768380684242e-05, "loss": 0.7897, "step": 27 }, { "epoch": 0.00557361820715281, "grad_norm": 1.2563328742980957, "learning_rate": 9.729086208503174e-05, "loss": 0.8182, "step": 30 }, { "epoch": 0.006130980027868091, "grad_norm": 1.1113193035125732, "learning_rate": 9.642770192448536e-05, "loss": 0.8223, "step": 33 }, { "epoch": 0.006316767301439851, "eval_loss": 0.8207408785820007, "eval_runtime": 278.0489, "eval_samples_per_second": 32.606, "eval_steps_per_second": 4.078, "step": 34 }, { "epoch": 0.006688341848583372, "grad_norm": 1.2708672285079956, "learning_rate": 9.545032675245813e-05, "loss": 0.8516, "step": 36 }, { "epoch": 0.007245703669298653, "grad_norm": 1.0124180316925049, "learning_rate": 9.43611409721806e-05, "loss": 0.8231, "step": 39 }, { "epoch": 0.0078030654900139345, "grad_norm": 0.8520745038986206, "learning_rate": 9.316282404787871e-05, "loss": 0.7552, "step": 42 }, { "epoch": 0.008360427310729215, "grad_norm": 1.1972272396087646, "learning_rate": 9.185832391312644e-05, "loss": 0.7681, "step": 45 }, { "epoch": 0.008917789131444497, "grad_norm": 0.9560966491699219, "learning_rate": 9.045084971874738e-05, "loss": 0.7997, "step": 48 }, { "epoch": 0.009475150952159776, "grad_norm": 0.8837025165557861, "learning_rate": 8.894386393810563e-05, "loss": 0.7901, "step": 51 }, { "epoch": 0.009475150952159776, "eval_loss": 0.7770994305610657, "eval_runtime": 278.1693, "eval_samples_per_second": 32.592, "eval_steps_per_second": 4.077, "step": 51 }, { "epoch": 0.010032512772875058, "grad_norm": 1.023627519607544, "learning_rate": 8.73410738492077e-05, "loss": 0.7579, "step": 54 }, { "epoch": 0.01058987459359034, "grad_norm": 0.8764439225196838, "learning_rate": 8.564642241456986e-05, "loss": 0.7208, "step": 57 }, { "epoch": 0.01114723641430562, "grad_norm": 0.7949612140655518, "learning_rate": 8.386407858128706e-05, "loss": 0.7907, "step": 60 }, { "epoch": 0.0117045982350209, "grad_norm": 1.581484079360962, "learning_rate": 8.199842702516583e-05, "loss": 0.7894, "step": 63 }, { "epoch": 0.012261960055736182, "grad_norm": 1.0892794132232666, "learning_rate": 8.005405736415126e-05, "loss": 0.8245, "step": 66 }, { "epoch": 0.012633534602879703, "eval_loss": 0.7482102513313293, "eval_runtime": 278.4703, "eval_samples_per_second": 32.556, "eval_steps_per_second": 4.072, "step": 68 }, { "epoch": 0.012819321876451463, "grad_norm": 0.9416138529777527, "learning_rate": 7.803575286758364e-05, "loss": 0.7452, "step": 69 }, { "epoch": 0.013376683697166745, "grad_norm": 0.9427577257156372, "learning_rate": 7.594847868906076e-05, "loss": 0.7362, "step": 72 }, { "epoch": 0.013934045517882025, "grad_norm": 0.8998479247093201, "learning_rate": 7.379736965185368e-05, "loss": 0.6653, "step": 75 }, { "epoch": 0.014491407338597306, "grad_norm": 0.9068602919578552, "learning_rate": 7.158771761692464e-05, "loss": 0.7179, "step": 78 }, { "epoch": 0.015048769159312587, "grad_norm": 0.8577080368995667, "learning_rate": 6.932495846462261e-05, "loss": 0.7525, "step": 81 }, { "epoch": 0.015606130980027869, "grad_norm": 1.0340560674667358, "learning_rate": 6.701465872208216e-05, "loss": 0.7724, "step": 84 }, { "epoch": 0.01579191825359963, "eval_loss": 0.7278433442115784, "eval_runtime": 278.2156, "eval_samples_per_second": 32.586, "eval_steps_per_second": 4.076, "step": 85 }, { "epoch": 0.01616349280074315, "grad_norm": 0.8426377773284912, "learning_rate": 6.466250186922325e-05, "loss": 0.751, "step": 87 }, { "epoch": 0.01672085462145843, "grad_norm": 0.6842407584190369, "learning_rate": 6.227427435703997e-05, "loss": 0.6239, "step": 90 }, { "epoch": 0.01727821644217371, "grad_norm": 0.943516731262207, "learning_rate": 5.985585137257401e-05, "loss": 0.6827, "step": 93 }, { "epoch": 0.017835578262888993, "grad_norm": 0.9946945905685425, "learning_rate": 5.74131823855921e-05, "loss": 0.6946, "step": 96 }, { "epoch": 0.018392940083604273, "grad_norm": 0.9049971699714661, "learning_rate": 5.495227651252315e-05, "loss": 0.7255, "step": 99 }, { "epoch": 0.018950301904319553, "grad_norm": 0.8010388016700745, "learning_rate": 5.247918773366112e-05, "loss": 0.7617, "step": 102 }, { "epoch": 0.018950301904319553, "eval_loss": 0.7182794809341431, "eval_runtime": 278.3252, "eval_samples_per_second": 32.573, "eval_steps_per_second": 4.074, "step": 102 }, { "epoch": 0.019507663725034836, "grad_norm": 0.8912818431854248, "learning_rate": 5e-05, "loss": 0.6662, "step": 105 }, { "epoch": 0.020065025545750115, "grad_norm": 0.8037520051002502, "learning_rate": 4.7520812266338885e-05, "loss": 0.655, "step": 108 }, { "epoch": 0.0206223873664654, "grad_norm": 0.9647955894470215, "learning_rate": 4.504772348747687e-05, "loss": 0.7077, "step": 111 }, { "epoch": 0.02117974918718068, "grad_norm": 0.7562959790229797, "learning_rate": 4.2586817614407895e-05, "loss": 0.6977, "step": 114 }, { "epoch": 0.021737111007895958, "grad_norm": 0.7197637557983398, "learning_rate": 4.0144148627425993e-05, "loss": 0.7263, "step": 117 }, { "epoch": 0.02210868555503948, "eval_loss": 0.7108221054077148, "eval_runtime": 278.2385, "eval_samples_per_second": 32.584, "eval_steps_per_second": 4.076, "step": 119 }, { "epoch": 0.02229447282861124, "grad_norm": 0.7262760400772095, "learning_rate": 3.772572564296005e-05, "loss": 0.6754, "step": 120 }, { "epoch": 0.02285183464932652, "grad_norm": 0.6029306650161743, "learning_rate": 3.533749813077677e-05, "loss": 0.7083, "step": 123 }, { "epoch": 0.0234091964700418, "grad_norm": 0.7203994393348694, "learning_rate": 3.298534127791785e-05, "loss": 0.6682, "step": 126 }, { "epoch": 0.023966558290757084, "grad_norm": 0.663855254650116, "learning_rate": 3.0675041535377405e-05, "loss": 0.6607, "step": 129 }, { "epoch": 0.024523920111472364, "grad_norm": 1.1171367168426514, "learning_rate": 2.8412282383075363e-05, "loss": 0.7137, "step": 132 }, { "epoch": 0.025081281932187643, "grad_norm": 0.8257487416267395, "learning_rate": 2.6202630348146324e-05, "loss": 0.6731, "step": 135 }, { "epoch": 0.025267069205759406, "eval_loss": 0.7051795125007629, "eval_runtime": 278.2312, "eval_samples_per_second": 32.584, "eval_steps_per_second": 4.076, "step": 136 }, { "epoch": 0.025638643752902927, "grad_norm": 0.7469737529754639, "learning_rate": 2.405152131093926e-05, "loss": 0.66, "step": 138 }, { "epoch": 0.026196005573618206, "grad_norm": 0.7631163001060486, "learning_rate": 2.196424713241637e-05, "loss": 0.7156, "step": 141 }, { "epoch": 0.02675336739433349, "grad_norm": 0.8524065017700195, "learning_rate": 1.9945942635848748e-05, "loss": 0.6498, "step": 144 }, { "epoch": 0.02731072921504877, "grad_norm": 0.7686144113540649, "learning_rate": 1.800157297483417e-05, "loss": 0.6891, "step": 147 }, { "epoch": 0.02786809103576405, "grad_norm": 0.8990215063095093, "learning_rate": 1.6135921418712956e-05, "loss": 0.583, "step": 150 }, { "epoch": 0.028425452856479332, "grad_norm": 0.7455560564994812, "learning_rate": 1.435357758543015e-05, "loss": 0.6584, "step": 153 }, { "epoch": 0.028425452856479332, "eval_loss": 0.7012729048728943, "eval_runtime": 278.4629, "eval_samples_per_second": 32.557, "eval_steps_per_second": 4.072, "step": 153 }, { "epoch": 0.028982814677194612, "grad_norm": 0.7660591006278992, "learning_rate": 1.2658926150792322e-05, "loss": 0.6667, "step": 156 }, { "epoch": 0.02954017649790989, "grad_norm": 0.6604006290435791, "learning_rate": 1.1056136061894384e-05, "loss": 0.6237, "step": 159 }, { "epoch": 0.030097538318625175, "grad_norm": 0.8076897263526917, "learning_rate": 9.549150281252633e-06, "loss": 0.6899, "step": 162 }, { "epoch": 0.030654900139340455, "grad_norm": 0.8138939738273621, "learning_rate": 8.141676086873572e-06, "loss": 0.606, "step": 165 }, { "epoch": 0.031212261960055738, "grad_norm": 0.7922996878623962, "learning_rate": 6.837175952121306e-06, "loss": 0.7121, "step": 168 }, { "epoch": 0.03158383650719926, "eval_loss": 0.6988500356674194, "eval_runtime": 278.3236, "eval_samples_per_second": 32.574, "eval_steps_per_second": 4.074, "step": 170 }, { "epoch": 0.031769623780771014, "grad_norm": 0.8445965051651001, "learning_rate": 5.6388590278194096e-06, "loss": 0.6391, "step": 171 }, { "epoch": 0.0323269856014863, "grad_norm": 0.7874613404273987, "learning_rate": 4.549673247541875e-06, "loss": 0.7674, "step": 174 }, { "epoch": 0.03288434742220158, "grad_norm": 0.8437705636024475, "learning_rate": 3.5722980755146517e-06, "loss": 0.6699, "step": 177 }, { "epoch": 0.03344170924291686, "grad_norm": 0.7565687298774719, "learning_rate": 2.7091379149682685e-06, "loss": 0.7346, "step": 180 }, { "epoch": 0.03399907106363214, "grad_norm": 0.7713425159454346, "learning_rate": 1.962316193157593e-06, "loss": 0.6421, "step": 183 }, { "epoch": 0.03455643288434742, "grad_norm": 0.755294144153595, "learning_rate": 1.333670137599713e-06, "loss": 0.6981, "step": 186 }, { "epoch": 0.03474222015791918, "eval_loss": 0.6977519392967224, "eval_runtime": 278.5744, "eval_samples_per_second": 32.544, "eval_steps_per_second": 4.071, "step": 187 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 17, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.567372765580493e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }