{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9455676516329703, "eval_steps": 100, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 124.5927520751953, "epoch": 0.24883359253499224, "grad_norm": 15.078858375549316, "kl": 19.285888576432626, "learning_rate": 1.6666666666666667e-05, "loss": 0.7714, "reward": 0.40569198310840876, "reward_std": 0.30648858696222303, "rewards/accuracy_reward": 0.11316964841680602, "rewards/format_reward": 0.29252233491279184, "step": 10 }, { "completion_length": 104.66529519557953, "epoch": 0.4976671850699845, "grad_norm": 12.616145133972168, "kl": 2.282937492989004, "learning_rate": 1.973044870579824e-05, "loss": 0.0913, "reward": 1.0892857647966594, "reward_std": 0.24720423775725067, "rewards/accuracy_reward": 0.33191965956939384, "rewards/format_reward": 0.757366097078193, "step": 20 }, { "completion_length": 51.77388613224029, "epoch": 0.7465007776049767, "grad_norm": 6.543015480041504, "kl": 1.1295193679630757, "learning_rate": 1.866025403784439e-05, "loss": 0.0452, "reward": 1.6847098998725414, "reward_std": 0.24420843934640288, "rewards/accuracy_reward": 0.7239955674856902, "rewards/format_reward": 0.9607143193483353, "step": 30 }, { "completion_length": 127.24275121688842, "epoch": 0.995334370139969, "grad_norm": 0.9520509839057922, "kl": 1.2096525609493256, "learning_rate": 1.686241637868734e-05, "loss": 0.0484, "reward": 1.6343750745058059, "reward_std": 0.303326430125162, "rewards/accuracy_reward": 0.743191996961832, "rewards/format_reward": 0.8911830706521868, "step": 40 }, { "completion_length": 27.513606726717786, "epoch": 1.223950233281493, "grad_norm": 2.879532814025879, "kl": 1.3245808591648025, "learning_rate": 1.4487991802004625e-05, "loss": 0.0487, "reward": 1.8011419627131249, "reward_std": 0.12810517903290639, "rewards/accuracy_reward": 0.8123178200251391, "rewards/format_reward": 0.9888241238334552, "step": 50 }, { "completion_length": 49.2135066151619, "epoch": 1.4727838258164851, "grad_norm": 3.839486598968506, "kl": 1.5542371474206447, "learning_rate": 1.1736481776669307e-05, "loss": 0.0622, "reward": 1.67444204390049, "reward_std": 0.20320529993623496, "rewards/accuracy_reward": 0.7047991381958127, "rewards/format_reward": 0.9696428939700127, "step": 60 }, { "completion_length": 35.35312653779984, "epoch": 1.7216174183514774, "grad_norm": 12.68962287902832, "kl": 1.5803978390991689, "learning_rate": 8.839070858747697e-06, "loss": 0.0632, "reward": 1.7077009677886963, "reward_std": 0.19827912435866893, "rewards/accuracy_reward": 0.7397321753203869, "rewards/format_reward": 0.9679687764495611, "step": 70 }, { "completion_length": 48.37343968153, "epoch": 1.9704510108864697, "grad_norm": 1.9540529251098633, "kl": 1.5852291900664568, "learning_rate": 6.039202339608432e-06, "loss": 0.0634, "reward": 1.741852756589651, "reward_std": 0.2563208385836333, "rewards/accuracy_reward": 0.7801339650526643, "rewards/format_reward": 0.9617187809199095, "step": 80 }, { "completion_length": 39.21793181231233, "epoch": 2.1990668740279937, "grad_norm": 5.398323059082031, "kl": 1.3583059319022561, "learning_rate": 3.5721239031346067e-06, "loss": 0.0499, "reward": 1.8300535289608701, "reward_std": 0.13957984269070786, "rewards/accuracy_reward": 0.8479106251885291, "rewards/format_reward": 0.9821428830931787, "step": 90 }, { "completion_length": 69.5315878033638, "epoch": 2.447900466562986, "grad_norm": 4.969150543212891, "kl": 1.311043781042099, "learning_rate": 1.6451218858706374e-06, "loss": 0.0524, "reward": 1.771205434948206, "reward_std": 0.25496325781568885, "rewards/accuracy_reward": 0.818303607776761, "rewards/format_reward": 0.9529018204659223, "step": 100 }, { "completion_length": 40.709488260746, "epoch": 2.6967340590979783, "grad_norm": 0.640394389629364, "kl": 1.182990526035428, "learning_rate": 4.2010487684511105e-07, "loss": 0.0473, "reward": 1.8337054342031478, "reward_std": 0.13419676972553135, "rewards/accuracy_reward": 0.8508928921073675, "rewards/format_reward": 0.9828125294297934, "step": 110 }, { "completion_length": 36.980805253982545, "epoch": 2.9455676516329703, "grad_norm": 1.0733976364135742, "kl": 1.124643165245652, "learning_rate": 0.0, "loss": 0.045, "reward": 1.8347098976373672, "reward_std": 0.1128911126870662, "rewards/accuracy_reward": 0.8465402111411094, "rewards/format_reward": 0.9881696701049805, "step": 120 }, { "epoch": 2.9455676516329703, "step": 120, "total_flos": 0.0, "train_loss": 0.11570480888088544, "train_runtime": 16303.6208, "train_samples_per_second": 0.828, "train_steps_per_second": 0.007 } ], "logging_steps": 10, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }