{ "best_metric": 0.630431056022644, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.021073483236044084, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.214696647208817e-05, "eval_loss": 1.147399663925171, "eval_runtime": 871.891, "eval_samples_per_second": 11.458, "eval_steps_per_second": 2.865, "step": 1 }, { "epoch": 0.0004214696647208817, "grad_norm": 0.7626272439956665, "learning_rate": 4.34e-05, "loss": 1.0628, "step": 10 }, { "epoch": 0.0008429393294417634, "grad_norm": 0.4994406998157501, "learning_rate": 8.68e-05, "loss": 0.856, "step": 20 }, { "epoch": 0.001264408994162645, "grad_norm": 0.5089827179908752, "learning_rate": 0.0001302, "loss": 0.7376, "step": 30 }, { "epoch": 0.0016858786588835268, "grad_norm": 0.6126852035522461, "learning_rate": 0.0001736, "loss": 0.7691, "step": 40 }, { "epoch": 0.0021073483236044087, "grad_norm": 1.0332469940185547, "learning_rate": 0.000217, "loss": 0.7831, "step": 50 }, { "epoch": 0.0021073483236044087, "eval_loss": 0.7892972230911255, "eval_runtime": 872.9595, "eval_samples_per_second": 11.444, "eval_steps_per_second": 2.862, "step": 50 }, { "epoch": 0.00252881798832529, "grad_norm": 0.43384820222854614, "learning_rate": 0.00021673569945319091, "loss": 0.7715, "step": 60 }, { "epoch": 0.002950287653046172, "grad_norm": 0.4282907545566559, "learning_rate": 0.00021594408545846038, "loss": 0.6761, "step": 70 }, { "epoch": 0.0033717573177670535, "grad_norm": 0.4396986663341522, "learning_rate": 0.0002146290146796179, "loss": 0.6714, "step": 80 }, { "epoch": 0.0037932269824879354, "grad_norm": 0.5730768442153931, "learning_rate": 0.0002127968940093076, "loss": 0.7565, "step": 90 }, { "epoch": 0.004214696647208817, "grad_norm": 0.9540401697158813, "learning_rate": 0.00021045664935527106, "loss": 0.7393, "step": 100 }, { "epoch": 0.004214696647208817, "eval_loss": 0.7602397203445435, "eval_runtime": 872.7942, "eval_samples_per_second": 11.446, "eval_steps_per_second": 2.862, "step": 100 }, { "epoch": 0.004636166311929699, "grad_norm": 0.3995174765586853, "learning_rate": 0.00020761968215422217, "loss": 0.7404, "step": 110 }, { "epoch": 0.00505763597665058, "grad_norm": 0.4123441278934479, "learning_rate": 0.00020429981382519356, "loss": 0.6722, "step": 120 }, { "epoch": 0.005479105641371463, "grad_norm": 0.4265460968017578, "learning_rate": 0.00020051321843297219, "loss": 0.6447, "step": 130 }, { "epoch": 0.005900575306092344, "grad_norm": 0.5936018228530884, "learning_rate": 0.0001962783438896818, "loss": 0.7289, "step": 140 }, { "epoch": 0.006322044970813226, "grad_norm": 1.0116862058639526, "learning_rate": 0.0001916158220784091, "loss": 0.7043, "step": 150 }, { "epoch": 0.006322044970813226, "eval_loss": 0.7188068628311157, "eval_runtime": 872.6433, "eval_samples_per_second": 11.448, "eval_steps_per_second": 2.863, "step": 150 }, { "epoch": 0.006743514635534107, "grad_norm": 0.4303455054759979, "learning_rate": 0.00018654836833674362, "loss": 0.7164, "step": 160 }, { "epoch": 0.007164984300254989, "grad_norm": 0.42676424980163574, "learning_rate": 0.0001811006707899361, "loss": 0.6661, "step": 170 }, { "epoch": 0.007586453964975871, "grad_norm": 0.44973692297935486, "learning_rate": 0.0001752992700728339, "loss": 0.6661, "step": 180 }, { "epoch": 0.008007923629696753, "grad_norm": 0.48232847452163696, "learning_rate": 0.00016917243002657602, "loss": 0.6969, "step": 190 }, { "epoch": 0.008429393294417635, "grad_norm": 0.9322578310966492, "learning_rate": 0.00016275, "loss": 0.6935, "step": 200 }, { "epoch": 0.008429393294417635, "eval_loss": 0.7024884819984436, "eval_runtime": 872.9871, "eval_samples_per_second": 11.443, "eval_steps_per_second": 2.861, "step": 200 }, { "epoch": 0.008850862959138516, "grad_norm": 0.42416146397590637, "learning_rate": 0.0001560632694266149, "loss": 0.6823, "step": 210 }, { "epoch": 0.009272332623859398, "grad_norm": 0.394248902797699, "learning_rate": 0.00014914481538562646, "loss": 0.6591, "step": 220 }, { "epoch": 0.009693802288580279, "grad_norm": 0.40907424688339233, "learning_rate": 0.0001420283438896818, "loss": 0.6353, "step": 230 }, { "epoch": 0.01011527195330116, "grad_norm": 0.49951237440109253, "learning_rate": 0.00013474852567256393, "loss": 0.6632, "step": 240 }, { "epoch": 0.010536741618022042, "grad_norm": 1.0042486190795898, "learning_rate": 0.00012734082727686196, "loss": 0.6702, "step": 250 }, { "epoch": 0.010536741618022042, "eval_loss": 0.6787152290344238, "eval_runtime": 872.8552, "eval_samples_per_second": 11.445, "eval_steps_per_second": 2.862, "step": 250 }, { "epoch": 0.010958211282742925, "grad_norm": 0.4140114188194275, "learning_rate": 0.0001198413382645404, "loss": 0.6869, "step": 260 }, { "epoch": 0.011379680947463807, "grad_norm": 0.3980177640914917, "learning_rate": 0.00011228659539222137, "loss": 0.618, "step": 270 }, { "epoch": 0.011801150612184688, "grad_norm": 0.4262223243713379, "learning_rate": 0.00010471340460777866, "loss": 0.6145, "step": 280 }, { "epoch": 0.01222262027690557, "grad_norm": 0.5590672492980957, "learning_rate": 9.715866173545961e-05, "loss": 0.6731, "step": 290 }, { "epoch": 0.012644089941626451, "grad_norm": 0.9287014603614807, "learning_rate": 8.965917272313806e-05, "loss": 0.6508, "step": 300 }, { "epoch": 0.012644089941626451, "eval_loss": 0.6602598428726196, "eval_runtime": 873.3259, "eval_samples_per_second": 11.439, "eval_steps_per_second": 2.86, "step": 300 }, { "epoch": 0.013065559606347333, "grad_norm": 0.41679131984710693, "learning_rate": 8.225147432743606e-05, "loss": 0.658, "step": 310 }, { "epoch": 0.013487029271068214, "grad_norm": 0.4045983552932739, "learning_rate": 7.497165611031821e-05, "loss": 0.6414, "step": 320 }, { "epoch": 0.013908498935789097, "grad_norm": 0.4106775224208832, "learning_rate": 6.785518461437353e-05, "loss": 0.6159, "step": 330 }, { "epoch": 0.014329968600509979, "grad_norm": 0.5271579623222351, "learning_rate": 6.093673057338509e-05, "loss": 0.6481, "step": 340 }, { "epoch": 0.01475143826523086, "grad_norm": 0.8985204100608826, "learning_rate": 5.4250000000000024e-05, "loss": 0.6159, "step": 350 }, { "epoch": 0.01475143826523086, "eval_loss": 0.6442619562149048, "eval_runtime": 873.0849, "eval_samples_per_second": 11.442, "eval_steps_per_second": 2.861, "step": 350 }, { "epoch": 0.015172907929951742, "grad_norm": 0.41902175545692444, "learning_rate": 4.782756997342398e-05, "loss": 0.6228, "step": 360 }, { "epoch": 0.015594377594672623, "grad_norm": 0.3683434724807739, "learning_rate": 4.170072992716607e-05, "loss": 0.6158, "step": 370 }, { "epoch": 0.016015847259393506, "grad_norm": 0.4270426630973816, "learning_rate": 3.5899329210063916e-05, "loss": 0.6093, "step": 380 }, { "epoch": 0.016437316924114386, "grad_norm": 0.5241418480873108, "learning_rate": 3.045163166325637e-05, "loss": 0.662, "step": 390 }, { "epoch": 0.01685878658883527, "grad_norm": 0.7835712432861328, "learning_rate": 2.5384177921590895e-05, "loss": 0.6354, "step": 400 }, { "epoch": 0.01685878658883527, "eval_loss": 0.635778546333313, "eval_runtime": 873.4484, "eval_samples_per_second": 11.437, "eval_steps_per_second": 2.86, "step": 400 }, { "epoch": 0.01728025625355615, "grad_norm": 0.4076845645904541, "learning_rate": 2.0721656110318213e-05, "loss": 0.6454, "step": 410 }, { "epoch": 0.017701725918277032, "grad_norm": 0.4060816466808319, "learning_rate": 1.6486781567027783e-05, "loss": 0.6029, "step": 420 }, { "epoch": 0.018123195582997912, "grad_norm": 0.43102189898490906, "learning_rate": 1.2700186174806422e-05, "loss": 0.5974, "step": 430 }, { "epoch": 0.018544665247718795, "grad_norm": 0.515192985534668, "learning_rate": 9.380317845777794e-06, "loss": 0.6454, "step": 440 }, { "epoch": 0.01896613491243968, "grad_norm": 0.9731060266494751, "learning_rate": 6.543350644728947e-06, "loss": 0.6225, "step": 450 }, { "epoch": 0.01896613491243968, "eval_loss": 0.6311845779418945, "eval_runtime": 873.5871, "eval_samples_per_second": 11.436, "eval_steps_per_second": 2.859, "step": 450 }, { "epoch": 0.019387604577160558, "grad_norm": 0.38442152738571167, "learning_rate": 4.2031059906924e-06, "loss": 0.6438, "step": 460 }, { "epoch": 0.01980907424188144, "grad_norm": 0.38169050216674805, "learning_rate": 2.3709853203820825e-06, "loss": 0.6287, "step": 470 }, { "epoch": 0.02023054390660232, "grad_norm": 0.4052588641643524, "learning_rate": 1.0559145415396157e-06, "loss": 0.6131, "step": 480 }, { "epoch": 0.020652013571323204, "grad_norm": 0.47564056515693665, "learning_rate": 2.643005468090745e-07, "loss": 0.6782, "step": 490 }, { "epoch": 0.021073483236044084, "grad_norm": 0.9834715127944946, "learning_rate": 0.0, "loss": 0.6781, "step": 500 }, { "epoch": 0.021073483236044084, "eval_loss": 0.630431056022644, "eval_runtime": 874.2378, "eval_samples_per_second": 11.427, "eval_steps_per_second": 2.857, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3554433904345088e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }