{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 11.057634185929018, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.832641363143921, "logits/rejected": -2.7308225631713867, "logps/chosen": -194.5078887939453, "logps/pi_response": -109.15676879882812, "logps/ref_response": -109.15676879882812, "logps/rejected": -199.842041015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 15.370165930636123, "learning_rate": 4.930057285201027e-07, "logits/chosen": -2.751119613647461, "logits/rejected": -2.7138187885284424, "logps/chosen": -213.61614990234375, "logps/pi_response": -115.39461517333984, "logps/ref_response": -114.79134368896484, "logps/rejected": -267.3037109375, "loss": 0.6847, "rewards/accuracies": 0.5763888955116272, "rewards/chosen": -0.03307180106639862, "rewards/margins": 0.02672014944255352, "rewards/rejected": -0.05979194864630699, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 30.006490022587922, "learning_rate": 4.187457503795526e-07, "logits/chosen": -2.7758188247680664, "logits/rejected": -2.7319676876068115, "logps/chosen": -258.4744567871094, "logps/pi_response": -113.13661193847656, "logps/ref_response": -115.21757507324219, "logps/rejected": -313.4935302734375, "loss": 0.6311, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2657850384712219, "rewards/margins": 0.28187114000320435, "rewards/rejected": -0.5476561784744263, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 17.86539531293398, "learning_rate": 2.8691164100062034e-07, "logits/chosen": -2.7661945819854736, "logits/rejected": -2.7283473014831543, "logps/chosen": -281.109130859375, "logps/pi_response": -136.8625030517578, "logps/ref_response": -118.77275085449219, "logps/rejected": -353.06475830078125, "loss": 0.5838, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4710591733455658, "rewards/margins": 0.4681057035923004, "rewards/rejected": -0.9391648173332214, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 16.943629934390696, "learning_rate": 1.4248369943086995e-07, "logits/chosen": -2.6898460388183594, "logits/rejected": -2.651970624923706, "logps/chosen": -281.537841796875, "logps/pi_response": -143.15576171875, "logps/ref_response": -110.78315734863281, "logps/rejected": -359.834716796875, "loss": 0.5584, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.5756246447563171, "rewards/margins": 0.49899721145629883, "rewards/rejected": -1.0746219158172607, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 16.37436004148144, "learning_rate": 3.473909705816111e-08, "logits/chosen": -2.6713178157806396, "logits/rejected": -2.6225244998931885, "logps/chosen": -272.2515869140625, "logps/pi_response": -146.99404907226562, "logps/ref_response": -112.6738510131836, "logps/rejected": -340.9408874511719, "loss": 0.5611, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.5049266815185547, "rewards/margins": 0.4622128903865814, "rewards/rejected": -0.9671396017074585, "step": 50 }, { "epoch": 0.9874476987447699, "step": 59, "total_flos": 0.0, "train_loss": 0.5940128989138845, "train_runtime": 2688.5086, "train_samples_per_second": 5.685, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }