diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6403 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6155917425310937, + "eval_steps": 10, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004615976407231696, + "grad_norm": 60.83765068742266, + "learning_rate": 1.1494252873563218e-08, + "logits/chosen": 0.4711977541446686, + "logits/rejected": 0.4847034811973572, + "logps/chosen": -41.84939193725586, + "logps/rejected": -44.508792877197266, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.009231952814463392, + "grad_norm": 89.49857504360673, + "learning_rate": 2.2988505747126436e-08, + "logits/chosen": 0.4102262556552887, + "logits/rejected": 0.4489870071411133, + "logps/chosen": -33.33359909057617, + "logps/rejected": -48.11466979980469, + "loss": 0.6965, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": 0.016991177573800087, + "rewards/margins": -0.00249446090310812, + "rewards/rejected": 0.01948563940823078, + "step": 4 + }, + { + "epoch": 0.01384792922169509, + "grad_norm": 91.503921376188, + "learning_rate": 3.448275862068965e-08, + "logits/chosen": 0.4212642312049866, + "logits/rejected": 0.448761522769928, + "logps/chosen": -39.75364685058594, + "logps/rejected": -51.98044967651367, + "loss": 0.7058, + "rewards/accuracies": 0.4305555522441864, + "rewards/chosen": 0.01352924108505249, + "rewards/margins": -0.021095700562000275, + "rewards/rejected": 0.034624941647052765, + "step": 6 + }, + { + "epoch": 0.018463905628926785, + "grad_norm": 74.67414376851612, + "learning_rate": 4.597701149425287e-08, + "logits/chosen": 0.3533351719379425, + "logits/rejected": 0.38716739416122437, + "logps/chosen": -42.66749954223633, + "logps/rejected": -59.93525695800781, + "loss": 0.682, + "rewards/accuracies": 0.5972222089767456, + "rewards/chosen": 0.15689438581466675, + "rewards/margins": 0.02920585870742798, + "rewards/rejected": 0.12768852710723877, + "step": 8 + }, + { + "epoch": 0.023079882036158482, + "grad_norm": 68.70984683419653, + "learning_rate": 5.747126436781609e-08, + "logits/chosen": 0.49728691577911377, + "logits/rejected": 0.5158182978630066, + "logps/chosen": -40.442108154296875, + "logps/rejected": -47.894962310791016, + "loss": 0.6784, + "rewards/accuracies": 0.5833333134651184, + "rewards/chosen": 0.20257243514060974, + "rewards/margins": 0.033093564212322235, + "rewards/rejected": 0.1694788932800293, + "step": 10 + }, + { + "epoch": 0.023079882036158482, + "eval_logits/chosen": 0.3284655511379242, + "eval_logits/rejected": 0.3523290753364563, + "eval_logps/chosen": -41.368160247802734, + "eval_logps/rejected": -47.68316650390625, + "eval_loss": 0.6900005340576172, + "eval_rewards/accuracies": 0.5040322542190552, + "eval_rewards/chosen": 0.18856020271778107, + "eval_rewards/margins": 0.010975954122841358, + "eval_rewards/rejected": 0.1775842159986496, + "eval_runtime": 223.5149, + "eval_samples_per_second": 7.758, + "eval_steps_per_second": 1.942, + "step": 10 + }, + { + "epoch": 0.02769585844339018, + "grad_norm": 83.08821357925031, + "learning_rate": 6.89655172413793e-08, + "logits/chosen": 0.39168137311935425, + "logits/rejected": 0.428312748670578, + "logps/chosen": -40.189659118652344, + "logps/rejected": -55.229732513427734, + "loss": 0.6825, + "rewards/accuracies": 0.5138888955116272, + "rewards/chosen": 0.10889428108930588, + "rewards/margins": 0.025906018912792206, + "rewards/rejected": 0.08298826217651367, + "step": 12 + }, + { + "epoch": 0.032311834850621876, + "grad_norm": 83.2788469670015, + "learning_rate": 8.045977011494252e-08, + "logits/chosen": 0.4244603216648102, + "logits/rejected": 0.45606857538223267, + "logps/chosen": -45.81875228881836, + "logps/rejected": -59.79555130004883, + "loss": 0.707, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.0007680323324166238, + "rewards/margins": -0.02245757356286049, + "rewards/rejected": 0.023225605487823486, + "step": 14 + }, + { + "epoch": 0.03692781125785357, + "grad_norm": 65.27464739827121, + "learning_rate": 9.195402298850574e-08, + "logits/chosen": 0.43778783082962036, + "logits/rejected": 0.47771337628364563, + "logps/chosen": -33.643489837646484, + "logps/rejected": -47.315940856933594, + "loss": 0.6907, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.16155114769935608, + "rewards/margins": 0.009526676498353481, + "rewards/rejected": 0.15202444791793823, + "step": 16 + }, + { + "epoch": 0.04154378766508527, + "grad_norm": 60.46601141846051, + "learning_rate": 1.0344827586206897e-07, + "logits/chosen": 0.4576772153377533, + "logits/rejected": 0.4669303894042969, + "logps/chosen": -49.01601791381836, + "logps/rejected": -44.165489196777344, + "loss": 0.7024, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": 0.1639019399881363, + "rewards/margins": -0.013081331737339497, + "rewards/rejected": 0.1769832819700241, + "step": 18 + }, + { + "epoch": 0.046159764072316964, + "grad_norm": 79.31933330847342, + "learning_rate": 1.1494252873563217e-07, + "logits/chosen": 0.40101033449172974, + "logits/rejected": 0.4429229199886322, + "logps/chosen": -42.295860290527344, + "logps/rejected": -61.62363052368164, + "loss": 0.6993, + "rewards/accuracies": 0.5694444179534912, + "rewards/chosen": 0.1744026243686676, + "rewards/margins": -0.005564332008361816, + "rewards/rejected": 0.17996692657470703, + "step": 20 + }, + { + "epoch": 0.046159764072316964, + "eval_logits/chosen": 0.3300890624523163, + "eval_logits/rejected": 0.3539319634437561, + "eval_logps/chosen": -41.36879348754883, + "eval_logps/rejected": -47.67192840576172, + "eval_loss": 0.6927710771560669, + "eval_rewards/accuracies": 0.4694700539112091, + "eval_rewards/chosen": 0.1882432997226715, + "eval_rewards/margins": 0.005040565971285105, + "eval_rewards/rejected": 0.18320275843143463, + "eval_runtime": 220.5959, + "eval_samples_per_second": 7.861, + "eval_steps_per_second": 1.967, + "step": 20 + }, + { + "epoch": 0.05077574047954866, + "grad_norm": 74.95614553584633, + "learning_rate": 1.2643678160919542e-07, + "logits/chosen": 0.35644879937171936, + "logits/rejected": 0.39824995398521423, + "logps/chosen": -44.09666442871094, + "logps/rejected": -67.98532104492188, + "loss": 0.6849, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.061192478984594345, + "rewards/margins": 0.024405598640441895, + "rewards/rejected": 0.03678688034415245, + "step": 22 + }, + { + "epoch": 0.05539171688678036, + "grad_norm": 59.95529358393387, + "learning_rate": 1.379310344827586e-07, + "logits/chosen": 0.4076593816280365, + "logits/rejected": 0.4187220335006714, + "logps/chosen": -50.34169006347656, + "logps/rejected": -52.33488464355469, + "loss": 0.6737, + "rewards/accuracies": 0.5972222089767456, + "rewards/chosen": 0.18035806715488434, + "rewards/margins": 0.046983275562524796, + "rewards/rejected": 0.13337479531764984, + "step": 24 + }, + { + "epoch": 0.06000769329401205, + "grad_norm": 58.82337491053465, + "learning_rate": 1.4942528735632184e-07, + "logits/chosen": 0.38400429487228394, + "logits/rejected": 0.3896331191062927, + "logps/chosen": -45.30482482910156, + "logps/rejected": -38.63485336303711, + "loss": 0.6927, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.12515152990818024, + "rewards/margins": 0.005594419315457344, + "rewards/rejected": 0.11955711245536804, + "step": 26 + }, + { + "epoch": 0.06462366970124375, + "grad_norm": 79.835671840217, + "learning_rate": 1.6091954022988505e-07, + "logits/chosen": 0.38133352994918823, + "logits/rejected": 0.4193841814994812, + "logps/chosen": -46.66801452636719, + "logps/rejected": -66.68572998046875, + "loss": 0.6817, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.1615171581506729, + "rewards/margins": 0.03017430752515793, + "rewards/rejected": 0.1313428282737732, + "step": 28 + }, + { + "epoch": 0.06923964610847544, + "grad_norm": 63.4551490148668, + "learning_rate": 1.7241379310344828e-07, + "logits/chosen": 0.38135284185409546, + "logits/rejected": 0.4095006585121155, + "logps/chosen": -40.06434631347656, + "logps/rejected": -49.53153610229492, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18324324488639832, + "rewards/margins": 0.04405728355050087, + "rewards/rejected": 0.13918595016002655, + "step": 30 + }, + { + "epoch": 0.06923964610847544, + "eval_logits/chosen": 0.33052363991737366, + "eval_logits/rejected": 0.3544217050075531, + "eval_logps/chosen": -41.5091438293457, + "eval_logps/rejected": -47.86111068725586, + "eval_loss": 0.6812014579772949, + "eval_rewards/accuracies": 0.546658992767334, + "eval_rewards/chosen": 0.11806601285934448, + "eval_rewards/margins": 0.029455602169036865, + "eval_rewards/rejected": 0.08861041069030762, + "eval_runtime": 220.5898, + "eval_samples_per_second": 7.861, + "eval_steps_per_second": 1.967, + "step": 30 + }, + { + "epoch": 0.07385562251570714, + "grad_norm": 54.00177702879831, + "learning_rate": 1.839080459770115e-07, + "logits/chosen": 0.4309755563735962, + "logits/rejected": 0.45285335183143616, + "logps/chosen": -42.45962905883789, + "logps/rejected": -47.46916198730469, + "loss": 0.6778, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.059671804308891296, + "rewards/margins": 0.03700065612792969, + "rewards/rejected": 0.022671150043606758, + "step": 32 + }, + { + "epoch": 0.07847159892293884, + "grad_norm": 55.987754524641964, + "learning_rate": 1.9540229885057472e-07, + "logits/chosen": 0.3958838880062103, + "logits/rejected": 0.43136459589004517, + "logps/chosen": -37.61958694458008, + "logps/rejected": -52.296146392822266, + "loss": 0.6756, + "rewards/accuracies": 0.5138888955116272, + "rewards/chosen": 0.2221948355436325, + "rewards/margins": 0.04236772283911705, + "rewards/rejected": 0.17982712388038635, + "step": 34 + }, + { + "epoch": 0.08308757533017054, + "grad_norm": 67.31410028619514, + "learning_rate": 2.0689655172413793e-07, + "logits/chosen": 0.44812121987342834, + "logits/rejected": 0.46431127190589905, + "logps/chosen": -42.98078155517578, + "logps/rejected": -41.65153884887695, + "loss": 0.6493, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": 0.30534830689430237, + "rewards/margins": 0.09969804435968399, + "rewards/rejected": 0.20565026998519897, + "step": 36 + }, + { + "epoch": 0.08770355173740223, + "grad_norm": 57.904024813693326, + "learning_rate": 2.1839080459770114e-07, + "logits/chosen": 0.49128374457359314, + "logits/rejected": 0.5145975351333618, + "logps/chosen": -44.50560760498047, + "logps/rejected": -49.38070297241211, + "loss": 0.6628, + "rewards/accuracies": 0.5972222089767456, + "rewards/chosen": 0.18226391077041626, + "rewards/margins": 0.0719500482082367, + "rewards/rejected": 0.11031384021043777, + "step": 38 + }, + { + "epoch": 0.09231952814463393, + "grad_norm": 64.05496800782066, + "learning_rate": 2.2988505747126435e-07, + "logits/chosen": 0.46414005756378174, + "logits/rejected": 0.47909700870513916, + "logps/chosen": -45.80656433105469, + "logps/rejected": -48.13614273071289, + "loss": 0.6614, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": 0.1019454374909401, + "rewards/margins": 0.07323868572711945, + "rewards/rejected": 0.028706755489110947, + "step": 40 + }, + { + "epoch": 0.09231952814463393, + "eval_logits/chosen": 0.3304091989994049, + "eval_logits/rejected": 0.35432368516921997, + "eval_logps/chosen": -41.51032638549805, + "eval_logps/rejected": -47.951194763183594, + "eval_loss": 0.6623325347900391, + "eval_rewards/accuracies": 0.5748847723007202, + "eval_rewards/chosen": 0.11747448146343231, + "eval_rewards/margins": 0.07390521466732025, + "eval_rewards/rejected": 0.04356926307082176, + "eval_runtime": 220.5888, + "eval_samples_per_second": 7.861, + "eval_steps_per_second": 1.967, + "step": 40 + }, + { + "epoch": 0.09693550455186563, + "grad_norm": 57.52177717893154, + "learning_rate": 2.413793103448276e-07, + "logits/chosen": 0.40689817070961, + "logits/rejected": 0.427402138710022, + "logps/chosen": -38.75439453125, + "logps/rejected": -44.31669235229492, + "loss": 0.6302, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.2862703502178192, + "rewards/margins": 0.14747940003871918, + "rewards/rejected": 0.13879093527793884, + "step": 42 + }, + { + "epoch": 0.10155148095909731, + "grad_norm": 64.43087177898828, + "learning_rate": 2.5287356321839084e-07, + "logits/chosen": 0.38502392172813416, + "logits/rejected": 0.42915642261505127, + "logps/chosen": -44.23611831665039, + "logps/rejected": -70.150634765625, + "loss": 0.6523, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.29370981454849243, + "rewards/margins": 0.11090421676635742, + "rewards/rejected": 0.18280558288097382, + "step": 44 + }, + { + "epoch": 0.10616745736632902, + "grad_norm": 60.64457511313282, + "learning_rate": 2.64367816091954e-07, + "logits/chosen": 0.4625084698200226, + "logits/rejected": 0.47940170764923096, + "logps/chosen": -47.40989685058594, + "logps/rejected": -50.2266731262207, + "loss": 0.6586, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.19395428895950317, + "rewards/margins": 0.09819034487009048, + "rewards/rejected": 0.09576395153999329, + "step": 46 + }, + { + "epoch": 0.11078343377356072, + "grad_norm": 48.97275141927136, + "learning_rate": 2.758620689655172e-07, + "logits/chosen": 0.40377330780029297, + "logits/rejected": 0.4251302480697632, + "logps/chosen": -40.91835021972656, + "logps/rejected": -46.69221878051758, + "loss": 0.6553, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.19001546502113342, + "rewards/margins": 0.11491294950246811, + "rewards/rejected": 0.07510250806808472, + "step": 48 + }, + { + "epoch": 0.1153994101807924, + "grad_norm": 50.405334242894924, + "learning_rate": 2.873563218390804e-07, + "logits/chosen": 0.42986366152763367, + "logits/rejected": 0.4425734579563141, + "logps/chosen": -45.240882873535156, + "logps/rejected": -45.33219528198242, + "loss": 0.6545, + "rewards/accuracies": 0.5972222089767456, + "rewards/chosen": 0.27720221877098083, + "rewards/margins": 0.10149689018726349, + "rewards/rejected": 0.17570529878139496, + "step": 50 + }, + { + "epoch": 0.1153994101807924, + "eval_logits/chosen": 0.3320508301258087, + "eval_logits/rejected": 0.35591834783554077, + "eval_logps/chosen": -41.36655807495117, + "eval_logps/rejected": -48.028892517089844, + "eval_loss": 0.6255878210067749, + "eval_rewards/accuracies": 0.6278801560401917, + "eval_rewards/chosen": 0.1893603652715683, + "eval_rewards/margins": 0.18464061617851257, + "eval_rewards/rejected": 0.004719759337604046, + "eval_runtime": 220.5277, + "eval_samples_per_second": 7.863, + "eval_steps_per_second": 1.968, + "step": 50 + }, + { + "epoch": 0.1200153865880241, + "grad_norm": 59.45036930086866, + "learning_rate": 2.988505747126437e-07, + "logits/chosen": 0.4412320852279663, + "logits/rejected": 0.47745391726493835, + "logps/chosen": -38.808204650878906, + "logps/rejected": -57.61214828491211, + "loss": 0.6523, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.24403540790081024, + "rewards/margins": 0.14384596049785614, + "rewards/rejected": 0.10018942505121231, + "step": 52 + }, + { + "epoch": 0.1246313629952558, + "grad_norm": 54.50350019894718, + "learning_rate": 3.103448275862069e-07, + "logits/chosen": 0.305615097284317, + "logits/rejected": 0.3378358781337738, + "logps/chosen": -41.46311950683594, + "logps/rejected": -55.873138427734375, + "loss": 0.6345, + "rewards/accuracies": 0.5694444179534912, + "rewards/chosen": 0.2176976054906845, + "rewards/margins": 0.17212893068790436, + "rewards/rejected": 0.045568663626909256, + "step": 54 + }, + { + "epoch": 0.1292473394024875, + "grad_norm": 49.044811443941626, + "learning_rate": 3.218390804597701e-07, + "logits/chosen": 0.4806426763534546, + "logits/rejected": 0.5007810592651367, + "logps/chosen": -37.00300216674805, + "logps/rejected": -42.795040130615234, + "loss": 0.6005, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.41853082180023193, + "rewards/margins": 0.23157899081707, + "rewards/rejected": 0.18695180118083954, + "step": 56 + }, + { + "epoch": 0.1338633158097192, + "grad_norm": 54.19272761171978, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": 0.4073159098625183, + "logits/rejected": 0.4315372109413147, + "logps/chosen": -39.63461685180664, + "logps/rejected": -41.75359344482422, + "loss": 0.5767, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.488341748714447, + "rewards/margins": 0.2964838743209839, + "rewards/rejected": 0.19185791909694672, + "step": 58 + }, + { + "epoch": 0.13847929221695088, + "grad_norm": 45.35161413256781, + "learning_rate": 3.4482758620689656e-07, + "logits/chosen": 0.3869187831878662, + "logits/rejected": 0.4154462218284607, + "logps/chosen": -40.21774673461914, + "logps/rejected": -49.05698013305664, + "loss": 0.5939, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.4058813452720642, + "rewards/margins": 0.2916874289512634, + "rewards/rejected": 0.11419390141963959, + "step": 60 + }, + { + "epoch": 0.13847929221695088, + "eval_logits/chosen": 0.3351307511329651, + "eval_logits/rejected": 0.35898876190185547, + "eval_logps/chosen": -40.8883171081543, + "eval_logps/rejected": -47.73066711425781, + "eval_loss": 0.5981891751289368, + "eval_rewards/accuracies": 0.6745391488075256, + "eval_rewards/chosen": 0.4284805655479431, + "eval_rewards/margins": 0.2746467888355255, + "eval_rewards/rejected": 0.15383380651474, + "eval_runtime": 220.5776, + "eval_samples_per_second": 7.861, + "eval_steps_per_second": 1.968, + "step": 60 + }, + { + "epoch": 0.1430952686241826, + "grad_norm": 43.3905484769897, + "learning_rate": 3.5632183908045977e-07, + "logits/chosen": 0.4458725154399872, + "logits/rejected": 0.46262863278388977, + "logps/chosen": -40.7205924987793, + "logps/rejected": -47.21548080444336, + "loss": 0.6053, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 0.42287477850914, + "rewards/margins": 0.2391357719898224, + "rewards/rejected": 0.18373897671699524, + "step": 62 + }, + { + "epoch": 0.14771124503141428, + "grad_norm": 46.32576522285691, + "learning_rate": 3.67816091954023e-07, + "logits/chosen": 0.42775771021842957, + "logits/rejected": 0.4581214487552643, + "logps/chosen": -42.59015655517578, + "logps/rejected": -51.6392822265625, + "loss": 0.5669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43508684635162354, + "rewards/margins": 0.3611146807670593, + "rewards/rejected": 0.0739721804857254, + "step": 64 + }, + { + "epoch": 0.152327221438646, + "grad_norm": 42.51968356117282, + "learning_rate": 3.793103448275862e-07, + "logits/chosen": 0.4170711636543274, + "logits/rejected": 0.45475757122039795, + "logps/chosen": -38.8193359375, + "logps/rejected": -59.24808120727539, + "loss": 0.5514, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.5248206257820129, + "rewards/margins": 0.48520517349243164, + "rewards/rejected": 0.03961547836661339, + "step": 66 + }, + { + "epoch": 0.15694319784587768, + "grad_norm": 59.1347578756685, + "learning_rate": 3.9080459770114945e-07, + "logits/chosen": 0.3444980978965759, + "logits/rejected": 0.38142290711402893, + "logps/chosen": -37.63268280029297, + "logps/rejected": -56.55868911743164, + "loss": 0.6421, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": 0.28356897830963135, + "rewards/margins": 0.37217438220977783, + "rewards/rejected": -0.0886053591966629, + "step": 68 + }, + { + "epoch": 0.16155917425310937, + "grad_norm": 43.01512643240851, + "learning_rate": 4.0229885057471266e-07, + "logits/chosen": 0.47459664940834045, + "logits/rejected": 0.5048218369483948, + "logps/chosen": -37.06074905395508, + "logps/rejected": -41.83311462402344, + "loss": 0.4985, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7238032221794128, + "rewards/margins": 0.6140663623809814, + "rewards/rejected": 0.10973668098449707, + "step": 70 + }, + { + "epoch": 0.16155917425310937, + "eval_logits/chosen": 0.3373754024505615, + "eval_logits/rejected": 0.361337274312973, + "eval_logps/chosen": -40.72093200683594, + "eval_logps/rejected": -47.9627685546875, + "eval_loss": 0.5673334002494812, + "eval_rewards/accuracies": 0.7073732614517212, + "eval_rewards/chosen": 0.512172520160675, + "eval_rewards/margins": 0.47438928484916687, + "eval_rewards/rejected": 0.03778325766324997, + "eval_runtime": 220.4667, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.969, + "step": 70 + }, + { + "epoch": 0.16617515066034108, + "grad_norm": 58.03313651952633, + "learning_rate": 4.1379310344827586e-07, + "logits/chosen": 0.47328370809555054, + "logits/rejected": 0.516916036605835, + "logps/chosen": -39.381927490234375, + "logps/rejected": -63.04606628417969, + "loss": 0.5446, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.5364831686019897, + "rewards/margins": 0.5998459458351135, + "rewards/rejected": -0.06336280703544617, + "step": 72 + }, + { + "epoch": 0.17079112706757277, + "grad_norm": 38.72118579621577, + "learning_rate": 4.25287356321839e-07, + "logits/chosen": 0.4743606746196747, + "logits/rejected": 0.48414406180381775, + "logps/chosen": -47.13395690917969, + "logps/rejected": -47.23988723754883, + "loss": 0.6296, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.46243613958358765, + "rewards/margins": 0.4000816345214844, + "rewards/rejected": 0.06235449016094208, + "step": 74 + }, + { + "epoch": 0.17540710347480445, + "grad_norm": 52.02457163056824, + "learning_rate": 4.367816091954023e-07, + "logits/chosen": 0.4869605302810669, + "logits/rejected": 0.5183277726173401, + "logps/chosen": -41.5470085144043, + "logps/rejected": -52.64150619506836, + "loss": 0.5554, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": 0.6152575612068176, + "rewards/margins": 0.5033391118049622, + "rewards/rejected": 0.11191850155591965, + "step": 76 + }, + { + "epoch": 0.18002307988203617, + "grad_norm": 39.942306949985145, + "learning_rate": 4.482758620689655e-07, + "logits/chosen": 0.4678427278995514, + "logits/rejected": 0.4919649660587311, + "logps/chosen": -36.33488845825195, + "logps/rejected": -46.28294372558594, + "loss": 0.5509, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.7630440592765808, + "rewards/margins": 0.5350204110145569, + "rewards/rejected": 0.22802363336086273, + "step": 78 + }, + { + "epoch": 0.18463905628926786, + "grad_norm": 49.68265630941736, + "learning_rate": 4.597701149425287e-07, + "logits/chosen": 0.4118601381778717, + "logits/rejected": 0.4340742528438568, + "logps/chosen": -36.466026306152344, + "logps/rejected": -40.359230041503906, + "loss": 0.5161, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.7488323450088501, + "rewards/margins": 0.6430253982543945, + "rewards/rejected": 0.1058068722486496, + "step": 80 + }, + { + "epoch": 0.18463905628926786, + "eval_logits/chosen": 0.3424670994281769, + "eval_logits/rejected": 0.3665529489517212, + "eval_logps/chosen": -40.603668212890625, + "eval_logps/rejected": -48.121337890625, + "eval_loss": 0.5314013957977295, + "eval_rewards/accuracies": 0.7206221222877502, + "eval_rewards/chosen": 0.5708039999008179, + "eval_rewards/margins": 0.6123039126396179, + "eval_rewards/rejected": -0.04149990156292915, + "eval_runtime": 220.269, + "eval_samples_per_second": 7.872, + "eval_steps_per_second": 1.97, + "step": 80 + }, + { + "epoch": 0.18925503269649954, + "grad_norm": 40.18854063526523, + "learning_rate": 4.712643678160919e-07, + "logits/chosen": 0.4146896302700043, + "logits/rejected": 0.44372716546058655, + "logps/chosen": -44.112205505371094, + "logps/rejected": -54.97979736328125, + "loss": 0.5066, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.5255990624427795, + "rewards/margins": 0.6622204780578613, + "rewards/rejected": -0.1366213709115982, + "step": 82 + }, + { + "epoch": 0.19387100910373126, + "grad_norm": 35.8737356471814, + "learning_rate": 4.827586206896552e-07, + "logits/chosen": 0.46901315450668335, + "logits/rejected": 0.5202505588531494, + "logps/chosen": -37.11308288574219, + "logps/rejected": -64.51854705810547, + "loss": 0.4791, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.6412889957427979, + "rewards/margins": 0.8545607924461365, + "rewards/rejected": -0.213271826505661, + "step": 84 + }, + { + "epoch": 0.19848698551096294, + "grad_norm": 38.06201763208911, + "learning_rate": 4.942528735632184e-07, + "logits/chosen": 0.4869195520877838, + "logits/rejected": 0.5158190727233887, + "logps/chosen": -41.02754592895508, + "logps/rejected": -52.270511627197266, + "loss": 0.4592, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.6756889820098877, + "rewards/margins": 0.815551221370697, + "rewards/rejected": -0.13986223936080933, + "step": 86 + }, + { + "epoch": 0.20310296191819463, + "grad_norm": 36.34414677933188, + "learning_rate": 4.999979670146248e-07, + "logits/chosen": 0.4226466119289398, + "logits/rejected": 0.4440222680568695, + "logps/chosen": -45.02897644042969, + "logps/rejected": -53.814674377441406, + "loss": 0.4665, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5262306928634644, + "rewards/margins": 0.772502601146698, + "rewards/rejected": -0.24627192318439484, + "step": 88 + }, + { + "epoch": 0.20771893832542634, + "grad_norm": 47.62232189356859, + "learning_rate": 4.99981703330008e-07, + "logits/chosen": 0.43458905816078186, + "logits/rejected": 0.45631253719329834, + "logps/chosen": -39.44232177734375, + "logps/rejected": -49.5074462890625, + "loss": 0.508, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.6604294180870056, + "rewards/margins": 0.6735664010047913, + "rewards/rejected": -0.013136889785528183, + "step": 90 + }, + { + "epoch": 0.20771893832542634, + "eval_logits/chosen": 0.3509339988231659, + "eval_logits/rejected": 0.37502503395080566, + "eval_logps/chosen": -40.31688690185547, + "eval_logps/rejected": -48.16210174560547, + "eval_loss": 0.4914422631263733, + "eval_rewards/accuracies": 0.7263824939727783, + "eval_rewards/chosen": 0.7141958475112915, + "eval_rewards/margins": 0.7760785222053528, + "eval_rewards/rejected": -0.06188271939754486, + "eval_runtime": 220.2045, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 90 + }, + { + "epoch": 0.21233491473265803, + "grad_norm": 33.53538998473167, + "learning_rate": 4.99949177018813e-07, + "logits/chosen": 0.4293578863143921, + "logits/rejected": 0.46211349964141846, + "logps/chosen": -34.20891571044922, + "logps/rejected": -45.82402420043945, + "loss": 0.4007, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.9528428316116333, + "rewards/margins": 1.0176244974136353, + "rewards/rejected": -0.06478171050548553, + "step": 92 + }, + { + "epoch": 0.21695089113988972, + "grad_norm": 47.45137697847349, + "learning_rate": 4.999003901970474e-07, + "logits/chosen": 0.4385245442390442, + "logits/rejected": 0.45108774304389954, + "logps/chosen": -47.24710464477539, + "logps/rejected": -47.30147171020508, + "loss": 0.5534, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.644627034664154, + "rewards/margins": 0.6343204975128174, + "rewards/rejected": 0.010306484065949917, + "step": 94 + }, + { + "epoch": 0.22156686754712143, + "grad_norm": 33.39629568865361, + "learning_rate": 4.998353460385512e-07, + "logits/chosen": 0.4504711329936981, + "logits/rejected": 0.48663392663002014, + "logps/chosen": -40.03446578979492, + "logps/rejected": -55.506591796875, + "loss": 0.4222, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.6967446804046631, + "rewards/margins": 1.0778069496154785, + "rewards/rejected": -0.381062388420105, + "step": 96 + }, + { + "epoch": 0.22618284395435312, + "grad_norm": 34.18594601316725, + "learning_rate": 4.997540487747892e-07, + "logits/chosen": 0.38444679975509644, + "logits/rejected": 0.4130491614341736, + "logps/chosen": -37.72957992553711, + "logps/rejected": -57.71113967895508, + "loss": 0.4716, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.864948570728302, + "rewards/margins": 1.0170652866363525, + "rewards/rejected": -0.152116596698761, + "step": 98 + }, + { + "epoch": 0.2307988203615848, + "grad_norm": 31.852168197293704, + "learning_rate": 4.996565036945769e-07, + "logits/chosen": 0.4658397436141968, + "logits/rejected": 0.4849558472633362, + "logps/chosen": -44.069618225097656, + "logps/rejected": -46.06491470336914, + "loss": 0.4924, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.5598255395889282, + "rewards/margins": 0.8147852420806885, + "rewards/rejected": -0.25495976209640503, + "step": 100 + }, + { + "epoch": 0.2307988203615848, + "eval_logits/chosen": 0.3590577244758606, + "eval_logits/rejected": 0.38313183188438416, + "eval_logps/chosen": -40.04033660888672, + "eval_logps/rejected": -48.23354721069336, + "eval_loss": 0.4618569314479828, + "eval_rewards/accuracies": 0.7298387289047241, + "eval_rewards/chosen": 0.852470874786377, + "eval_rewards/margins": 0.9500778913497925, + "eval_rewards/rejected": -0.09760700911283493, + "eval_runtime": 220.4716, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.969, + "step": 100 + }, + { + "epoch": 0.23541479676881652, + "grad_norm": 32.563251146586694, + "learning_rate": 4.995427171437356e-07, + "logits/chosen": 0.41394177079200745, + "logits/rejected": 0.4560126066207886, + "logps/chosen": -36.68212890625, + "logps/rejected": -56.006553649902344, + "loss": 0.3851, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.7943739891052246, + "rewards/margins": 1.1945956945419312, + "rewards/rejected": -0.40022173523902893, + "step": 102 + }, + { + "epoch": 0.2400307731760482, + "grad_norm": 35.159104202159625, + "learning_rate": 4.994126965246796e-07, + "logits/chosen": 0.43339937925338745, + "logits/rejected": 0.45789778232574463, + "logps/chosen": -40.00631332397461, + "logps/rejected": -48.161224365234375, + "loss": 0.4153, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.7441484928131104, + "rewards/margins": 1.0307202339172363, + "rewards/rejected": -0.28657177090644836, + "step": 104 + }, + { + "epoch": 0.24464674958327992, + "grad_norm": 35.54279884741835, + "learning_rate": 4.992664502959351e-07, + "logits/chosen": 0.42503511905670166, + "logits/rejected": 0.48626741766929626, + "logps/chosen": -36.73310852050781, + "logps/rejected": -73.78736877441406, + "loss": 0.3536, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.8739730715751648, + "rewards/margins": 1.558694839477539, + "rewards/rejected": -0.6847219467163086, + "step": 106 + }, + { + "epoch": 0.2492627259905116, + "grad_norm": 45.173611856138976, + "learning_rate": 4.991039879715898e-07, + "logits/chosen": 0.4289478361606598, + "logits/rejected": 0.46912992000579834, + "logps/chosen": -40.94606399536133, + "logps/rejected": -58.62925338745117, + "loss": 0.4057, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 1.023528814315796, + "rewards/margins": 1.251956582069397, + "rewards/rejected": -0.22842761874198914, + "step": 108 + }, + { + "epoch": 0.2538787023977433, + "grad_norm": 25.213187587591246, + "learning_rate": 4.989253201206736e-07, + "logits/chosen": 0.4647282361984253, + "logits/rejected": 0.4716295003890991, + "logps/chosen": -40.334922790527344, + "logps/rejected": -41.65603256225586, + "loss": 0.4339, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.9747889637947083, + "rewards/margins": 1.0528353452682495, + "rewards/rejected": -0.07804636657238007, + "step": 110 + }, + { + "epoch": 0.2538787023977433, + "eval_logits/chosen": 0.36145398020744324, + "eval_logits/rejected": 0.38587653636932373, + "eval_logps/chosen": -39.77558135986328, + "eval_logps/rejected": -48.301231384277344, + "eval_loss": 0.43463748693466187, + "eval_rewards/accuracies": 0.7379032373428345, + "eval_rewards/chosen": 0.9848493337631226, + "eval_rewards/margins": 1.1162999868392944, + "eval_rewards/rejected": -0.1314505934715271, + "eval_runtime": 220.4446, + "eval_samples_per_second": 7.866, + "eval_steps_per_second": 1.969, + "step": 110 + }, + { + "epoch": 0.258494678804975, + "grad_norm": 39.895524747857486, + "learning_rate": 4.987304583664712e-07, + "logits/chosen": 0.4972270429134369, + "logits/rejected": 0.5156663060188293, + "logps/chosen": -46.859134674072266, + "logps/rejected": -53.12602996826172, + "loss": 0.4463, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": 0.8810398578643799, + "rewards/margins": 0.9829990863800049, + "rewards/rejected": -0.10195919126272202, + "step": 112 + }, + { + "epoch": 0.26311065521220667, + "grad_norm": 36.88032065773003, + "learning_rate": 4.985194153857662e-07, + "logits/chosen": 0.4386284351348877, + "logits/rejected": 0.4557953476905823, + "logps/chosen": -36.74658203125, + "logps/rejected": -39.56464767456055, + "loss": 0.4788, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.9255303144454956, + "rewards/margins": 0.9156983494758606, + "rewards/rejected": 0.009831971488893032, + "step": 114 + }, + { + "epoch": 0.2677266316194384, + "grad_norm": 23.636821560598456, + "learning_rate": 4.982922049080163e-07, + "logits/chosen": 0.40630775690078735, + "logits/rejected": 0.4236665964126587, + "logps/chosen": -35.141971588134766, + "logps/rejected": -42.14583969116211, + "loss": 0.3872, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.8281899690628052, + "rewards/margins": 1.215153455734253, + "rewards/rejected": -0.386963427066803, + "step": 116 + }, + { + "epoch": 0.2723426080266701, + "grad_norm": 38.873691089935114, + "learning_rate": 4.980488417144599e-07, + "logits/chosen": 0.37884485721588135, + "logits/rejected": 0.4280329644680023, + "logps/chosen": -41.57583999633789, + "logps/rejected": -71.53160095214844, + "loss": 0.4818, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7444247603416443, + "rewards/margins": 1.1956461668014526, + "rewards/rejected": -0.4512213468551636, + "step": 118 + }, + { + "epoch": 0.27695858443390176, + "grad_norm": 27.126567445081033, + "learning_rate": 4.977893416371544e-07, + "logits/chosen": 0.4753592908382416, + "logits/rejected": 0.4997613728046417, + "logps/chosen": -34.07433319091797, + "logps/rejected": -45.33045959472656, + "loss": 0.3838, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.8865776062011719, + "rewards/margins": 1.4225349426269531, + "rewards/rejected": -0.5359571576118469, + "step": 120 + }, + { + "epoch": 0.27695858443390176, + "eval_logits/chosen": 0.3657575249671936, + "eval_logits/rejected": 0.39033937454223633, + "eval_logps/chosen": -39.95652770996094, + "eval_logps/rejected": -48.739437103271484, + "eval_loss": 0.410579651594162, + "eval_rewards/accuracies": 0.7540322542190552, + "eval_rewards/chosen": 0.8943750858306885, + "eval_rewards/margins": 1.2449262142181396, + "eval_rewards/rejected": -0.35055097937583923, + "eval_runtime": 220.2442, + "eval_samples_per_second": 7.873, + "eval_steps_per_second": 1.971, + "step": 120 + }, + { + "epoch": 0.28157456084113347, + "grad_norm": 26.815456443053705, + "learning_rate": 4.975137215579469e-07, + "logits/chosen": 0.5420396327972412, + "logits/rejected": 0.5500521659851074, + "logps/chosen": -45.788516998291016, + "logps/rejected": -42.21580505371094, + "loss": 0.4117, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.8019249439239502, + "rewards/margins": 1.2268595695495605, + "rewards/rejected": -0.42493465542793274, + "step": 122 + }, + { + "epoch": 0.2861905372483652, + "grad_norm": 30.749785890404876, + "learning_rate": 4.972219994073755e-07, + "logits/chosen": 0.49169254302978516, + "logits/rejected": 0.5404393672943115, + "logps/chosen": -38.644107818603516, + "logps/rejected": -67.01266479492188, + "loss": 0.3844, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8940033316612244, + "rewards/margins": 1.6317830085754395, + "rewards/rejected": -0.7377796173095703, + "step": 124 + }, + { + "epoch": 0.2908065136555969, + "grad_norm": 29.538977791375373, + "learning_rate": 4.969141941635025e-07, + "logits/chosen": 0.47598233819007874, + "logits/rejected": 0.5060492753982544, + "logps/chosen": -40.60331344604492, + "logps/rejected": -59.37862014770508, + "loss": 0.4476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5469496250152588, + "rewards/margins": 1.4448275566101074, + "rewards/rejected": -0.8978776931762695, + "step": 126 + }, + { + "epoch": 0.29542249006282856, + "grad_norm": 50.663011631161446, + "learning_rate": 4.965903258506806e-07, + "logits/chosen": 0.49228647351264954, + "logits/rejected": 0.5329996943473816, + "logps/chosen": -39.90941619873047, + "logps/rejected": -61.23884963989258, + "loss": 0.347, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.8301137685775757, + "rewards/margins": 1.5421695709228516, + "rewards/rejected": -0.7120558619499207, + "step": 128 + }, + { + "epoch": 0.30003846647006027, + "grad_norm": 32.98345370505989, + "learning_rate": 4.962504155382493e-07, + "logits/chosen": 0.4239842891693115, + "logits/rejected": 0.44136151671409607, + "logps/chosen": -36.07121276855469, + "logps/rejected": -41.06203079223633, + "loss": 0.3667, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.7973310351371765, + "rewards/margins": 1.2334084510803223, + "rewards/rejected": -0.4360772669315338, + "step": 130 + }, + { + "epoch": 0.30003846647006027, + "eval_logits/chosen": 0.3723231256008148, + "eval_logits/rejected": 0.3968786299228668, + "eval_logps/chosen": -39.925048828125, + "eval_logps/rejected": -48.9533576965332, + "eval_loss": 0.39173391461372375, + "eval_rewards/accuracies": 0.7753456234931946, + "eval_rewards/chosen": 0.9101160168647766, + "eval_rewards/margins": 1.3676302433013916, + "eval_rewards/rejected": -0.4575144052505493, + "eval_runtime": 220.267, + "eval_samples_per_second": 7.872, + "eval_steps_per_second": 1.97, + "step": 130 + }, + { + "epoch": 0.304654442877292, + "grad_norm": 28.392702162901728, + "learning_rate": 4.958944853391652e-07, + "logits/chosen": 0.520796537399292, + "logits/rejected": 0.5420558452606201, + "logps/chosen": -37.87763595581055, + "logps/rejected": -46.05318069458008, + "loss": 0.3819, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.932469367980957, + "rewards/margins": 1.2907413244247437, + "rewards/rejected": -0.3582719564437866, + "step": 132 + }, + { + "epoch": 0.30927041928452365, + "grad_norm": 27.83688192223066, + "learning_rate": 4.955225584085624e-07, + "logits/chosen": 0.42395105957984924, + "logits/rejected": 0.44882073998451233, + "logps/chosen": -36.98991775512695, + "logps/rejected": -51.79054260253906, + "loss": 0.3951, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": 0.9578195214271545, + "rewards/margins": 1.4403272867202759, + "rewards/rejected": -0.48250770568847656, + "step": 134 + }, + { + "epoch": 0.31388639569175536, + "grad_norm": 27.432482792006017, + "learning_rate": 4.951346589422467e-07, + "logits/chosen": 0.483965128660202, + "logits/rejected": 0.5153691172599792, + "logps/chosen": -37.48245620727539, + "logps/rejected": -54.50342559814453, + "loss": 0.3942, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 1.0384331941604614, + "rewards/margins": 1.5820738077163696, + "rewards/rejected": -0.5436408519744873, + "step": 136 + }, + { + "epoch": 0.3185023720989871, + "grad_norm": 46.62557646275611, + "learning_rate": 4.94730812175122e-07, + "logits/chosen": 0.43841731548309326, + "logits/rejected": 0.4499746561050415, + "logps/chosen": -38.93119812011719, + "logps/rejected": -42.26424026489258, + "loss": 0.4384, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.8961164951324463, + "rewards/margins": 1.247178554534912, + "rewards/rejected": -0.3510621190071106, + "step": 138 + }, + { + "epoch": 0.32311834850621873, + "grad_norm": 34.05743924648359, + "learning_rate": 4.943110443795476e-07, + "logits/chosen": 0.49757227301597595, + "logits/rejected": 0.5091323852539062, + "logps/chosen": -42.93407440185547, + "logps/rejected": -45.01084899902344, + "loss": 0.4061, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.8557752370834351, + "rewards/margins": 1.3424351215362549, + "rewards/rejected": -0.48665979504585266, + "step": 140 + }, + { + "epoch": 0.32311834850621873, + "eval_logits/chosen": 0.3763599395751953, + "eval_logits/rejected": 0.4011194705963135, + "eval_logps/chosen": -39.798316955566406, + "eval_logps/rejected": -49.11299133300781, + "eval_loss": 0.3788905441761017, + "eval_rewards/accuracies": 0.764976978302002, + "eval_rewards/chosen": 0.9734821915626526, + "eval_rewards/margins": 1.5108132362365723, + "eval_rewards/rejected": -0.5373309850692749, + "eval_runtime": 220.3233, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 140 + }, + { + "epoch": 0.32773432491345045, + "grad_norm": 36.481001944632766, + "learning_rate": 4.938753828636297e-07, + "logits/chosen": 0.4888935089111328, + "logits/rejected": 0.4963880777359009, + "logps/chosen": -46.02848815917969, + "logps/rejected": -44.94346237182617, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7767104506492615, + "rewards/margins": 1.235382080078125, + "rewards/rejected": -0.45867156982421875, + "step": 142 + }, + { + "epoch": 0.33235030132068216, + "grad_norm": 27.008693506029694, + "learning_rate": 4.934238559694447e-07, + "logits/chosen": 0.460690975189209, + "logits/rejected": 0.5057052969932556, + "logps/chosen": -38.473411560058594, + "logps/rejected": -54.91615295410156, + "loss": 0.3338, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.8698298335075378, + "rewards/margins": 1.6055673360824585, + "rewards/rejected": -0.7357374429702759, + "step": 144 + }, + { + "epoch": 0.3369662777279138, + "grad_norm": 32.261266015848825, + "learning_rate": 4.929564930711957e-07, + "logits/chosen": 0.4295574426651001, + "logits/rejected": 0.4522492587566376, + "logps/chosen": -39.829490661621094, + "logps/rejected": -44.733333587646484, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7346515655517578, + "rewards/margins": 1.3469676971435547, + "rewards/rejected": -0.6123161315917969, + "step": 146 + }, + { + "epoch": 0.34158225413514554, + "grad_norm": 28.797840444924386, + "learning_rate": 4.924733245733008e-07, + "logits/chosen": 0.5410254001617432, + "logits/rejected": 0.5485421419143677, + "logps/chosen": -43.81610870361328, + "logps/rejected": -40.52272033691406, + "loss": 0.3651, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": 0.9063374996185303, + "rewards/margins": 1.2729685306549072, + "rewards/rejected": -0.366630882024765, + "step": 148 + }, + { + "epoch": 0.34619823054237725, + "grad_norm": 30.202896827963542, + "learning_rate": 4.91974381908416e-07, + "logits/chosen": 0.42066994309425354, + "logits/rejected": 0.4589553475379944, + "logps/chosen": -38.81809997558594, + "logps/rejected": -58.59386444091797, + "loss": 0.3446, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.6800815463066101, + "rewards/margins": 1.928250789642334, + "rewards/rejected": -1.2481693029403687, + "step": 150 + }, + { + "epoch": 0.34619823054237725, + "eval_logits/chosen": 0.3821311295032501, + "eval_logits/rejected": 0.40684476494789124, + "eval_logps/chosen": -40.001861572265625, + "eval_logps/rejected": -49.4797477722168, + "eval_loss": 0.3633531332015991, + "eval_rewards/accuracies": 0.7724654674530029, + "eval_rewards/chosen": 0.8717083930969238, + "eval_rewards/margins": 1.5924171209335327, + "eval_rewards/rejected": -0.7207087278366089, + "eval_runtime": 220.1362, + "eval_samples_per_second": 7.877, + "eval_steps_per_second": 1.972, + "step": 150 + }, + { + "epoch": 0.3508142069496089, + "grad_norm": 26.385033894551455, + "learning_rate": 4.914596975353898e-07, + "logits/chosen": 0.4991176426410675, + "logits/rejected": 0.5242553353309631, + "logps/chosen": -38.974281311035156, + "logps/rejected": -48.54939270019531, + "loss": 0.3721, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.771294355392456, + "rewards/margins": 1.5243595838546753, + "rewards/rejected": -0.7530653476715088, + "step": 152 + }, + { + "epoch": 0.3554301833568406, + "grad_norm": 42.428423927932826, + "learning_rate": 4.909293049371519e-07, + "logits/chosen": 0.5288230180740356, + "logits/rejected": 0.5352779626846313, + "logps/chosen": -45.90478515625, + "logps/rejected": -44.53614044189453, + "loss": 0.3542, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.7464312314987183, + "rewards/margins": 1.5150079727172852, + "rewards/rejected": -0.7685766220092773, + "step": 154 + }, + { + "epoch": 0.36004615976407234, + "grad_norm": 36.75812549927479, + "learning_rate": 4.903832386185343e-07, + "logits/chosen": 0.47585126757621765, + "logits/rejected": 0.49040529131889343, + "logps/chosen": -44.172325134277344, + "logps/rejected": -43.98606872558594, + "loss": 0.3956, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": 0.5973650813102722, + "rewards/margins": 1.340658187866211, + "rewards/rejected": -0.743293046951294, + "step": 156 + }, + { + "epoch": 0.364662136171304, + "grad_norm": 26.152211217958236, + "learning_rate": 4.89821534104028e-07, + "logits/chosen": 0.39484938979148865, + "logits/rejected": 0.42477357387542725, + "logps/chosen": -41.93134307861328, + "logps/rejected": -56.39106750488281, + "loss": 0.3275, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.827555239200592, + "rewards/margins": 1.9599329233169556, + "rewards/rejected": -1.1323776245117188, + "step": 158 + }, + { + "epoch": 0.3692781125785357, + "grad_norm": 29.041350828980583, + "learning_rate": 4.892442279354698e-07, + "logits/chosen": 0.4744550287723541, + "logits/rejected": 0.5093830227851868, + "logps/chosen": -42.794578552246094, + "logps/rejected": -59.93064498901367, + "loss": 0.3605, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.540644645690918, + "rewards/margins": 1.6665728092193604, + "rewards/rejected": -1.125928282737732, + "step": 160 + }, + { + "epoch": 0.3692781125785357, + "eval_logits/chosen": 0.38920047879219055, + "eval_logits/rejected": 0.41388043761253357, + "eval_logps/chosen": -40.3745231628418, + "eval_logps/rejected": -49.94725036621094, + "eval_loss": 0.3510279059410095, + "eval_rewards/accuracies": 0.7920507192611694, + "eval_rewards/chosen": 0.6853779554367065, + "eval_rewards/margins": 1.6398398876190186, + "eval_rewards/rejected": -0.9544618129730225, + "eval_runtime": 220.1812, + "eval_samples_per_second": 7.875, + "eval_steps_per_second": 1.971, + "step": 160 + }, + { + "epoch": 0.3738940889857674, + "grad_norm": 32.36067481486556, + "learning_rate": 4.886513576696673e-07, + "logits/chosen": 0.4680570960044861, + "logits/rejected": 0.5030277371406555, + "logps/chosen": -42.39280700683594, + "logps/rejected": -58.18678283691406, + "loss": 0.392, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.7217347621917725, + "rewards/margins": 1.8615412712097168, + "rewards/rejected": -1.1398065090179443, + "step": 162 + }, + { + "epoch": 0.3785100653929991, + "grad_norm": 27.802667550507227, + "learning_rate": 4.880429618759543e-07, + "logits/chosen": 0.46893131732940674, + "logits/rejected": 0.4787411093711853, + "logps/chosen": -45.52459716796875, + "logps/rejected": -46.459312438964844, + "loss": 0.3819, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.870037317276001, + "rewards/margins": 1.4820109605789185, + "rewards/rejected": -0.6119736433029175, + "step": 164 + }, + { + "epoch": 0.3831260418002308, + "grad_norm": 27.278325528930967, + "learning_rate": 4.874190801336817e-07, + "logits/chosen": 0.46610963344573975, + "logits/rejected": 0.4872422218322754, + "logps/chosen": -44.28363037109375, + "logps/rejected": -51.54701232910156, + "loss": 0.323, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.6502636075019836, + "rewards/margins": 1.7216179370880127, + "rewards/rejected": -1.0713541507720947, + "step": 166 + }, + { + "epoch": 0.3877420182074625, + "grad_norm": 25.09454062223173, + "learning_rate": 4.867797530296431e-07, + "logits/chosen": 0.4582709074020386, + "logits/rejected": 0.48244646191596985, + "logps/chosen": -45.76988983154297, + "logps/rejected": -55.2458610534668, + "loss": 0.2842, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.6319215297698975, + "rewards/margins": 2.007154941558838, + "rewards/rejected": -1.3752332925796509, + "step": 168 + }, + { + "epoch": 0.39235799461469417, + "grad_norm": 25.014228656107395, + "learning_rate": 4.861250221554343e-07, + "logits/chosen": 0.4760267436504364, + "logits/rejected": 0.5161222219467163, + "logps/chosen": -36.09988021850586, + "logps/rejected": -58.49198913574219, + "loss": 0.317, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.6937295794487, + "rewards/margins": 2.0070507526397705, + "rewards/rejected": -1.3133213520050049, + "step": 170 + }, + { + "epoch": 0.39235799461469417, + "eval_logits/chosen": 0.39130648970603943, + "eval_logits/rejected": 0.41622862219810486, + "eval_logps/chosen": -40.4833984375, + "eval_logps/rejected": -50.18775177001953, + "eval_loss": 0.343056857585907, + "eval_rewards/accuracies": 0.796658992767334, + "eval_rewards/chosen": 0.6309407949447632, + "eval_rewards/margins": 1.7056493759155273, + "eval_rewards/rejected": -1.0747085809707642, + "eval_runtime": 220.3261, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 170 + }, + { + "epoch": 0.3969739710219259, + "grad_norm": 21.660777806253456, + "learning_rate": 4.854549301047476e-07, + "logits/chosen": 0.5408195853233337, + "logits/rejected": 0.5565234422683716, + "logps/chosen": -42.90623474121094, + "logps/rejected": -43.590702056884766, + "loss": 0.373, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.6533936262130737, + "rewards/margins": 1.5095248222351074, + "rewards/rejected": -0.8561312556266785, + "step": 172 + }, + { + "epoch": 0.4015899474291576, + "grad_norm": 32.27746768838142, + "learning_rate": 4.847695204706005e-07, + "logits/chosen": 0.47649839520454407, + "logits/rejected": 0.49190616607666016, + "logps/chosen": -38.49553680419922, + "logps/rejected": -40.65150451660156, + "loss": 0.3558, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.7918031811714172, + "rewards/margins": 1.4087355136871338, + "rewards/rejected": -0.6169323325157166, + "step": 174 + }, + { + "epoch": 0.40620592383638926, + "grad_norm": 31.844706703711985, + "learning_rate": 4.840688378425e-07, + "logits/chosen": 0.5188453793525696, + "logits/rejected": 0.5562708973884583, + "logps/chosen": -46.135372161865234, + "logps/rejected": -56.292930603027344, + "loss": 0.261, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.7925480604171753, + "rewards/margins": 2.1678171157836914, + "rewards/rejected": -1.3752690553665161, + "step": 176 + }, + { + "epoch": 0.410821900243621, + "grad_norm": 26.376171346573187, + "learning_rate": 4.833529278035422e-07, + "logits/chosen": 0.357127845287323, + "logits/rejected": 0.4103134572505951, + "logps/chosen": -37.78556442260742, + "logps/rejected": -67.52072143554688, + "loss": 0.2899, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.9015005826950073, + "rewards/margins": 2.719820261001587, + "rewards/rejected": -1.81831955909729, + "step": 178 + }, + { + "epoch": 0.4154378766508527, + "grad_norm": 26.0393680431703, + "learning_rate": 4.826218369274459e-07, + "logits/chosen": 0.4666251540184021, + "logits/rejected": 0.5160384178161621, + "logps/chosen": -39.356258392333984, + "logps/rejected": -62.83391571044922, + "loss": 0.3066, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.8675535917282104, + "rewards/margins": 2.234145164489746, + "rewards/rejected": -1.3665915727615356, + "step": 180 + }, + { + "epoch": 0.4154378766508527, + "eval_logits/chosen": 0.3935144245624542, + "eval_logits/rejected": 0.41844189167022705, + "eval_logps/chosen": -39.861793518066406, + "eval_logps/rejected": -49.855037689208984, + "eval_loss": 0.3321295380592346, + "eval_rewards/accuracies": 0.7926267385482788, + "eval_rewards/chosen": 0.941743791103363, + "eval_rewards/margins": 1.8500969409942627, + "eval_rewards/rejected": -0.9083530902862549, + "eval_runtime": 220.3176, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 180 + }, + { + "epoch": 0.42005385305808435, + "grad_norm": 23.061889635448846, + "learning_rate": 4.818756127755237e-07, + "logits/chosen": 0.49034425616264343, + "logits/rejected": 0.5069853663444519, + "logps/chosen": -37.846553802490234, + "logps/rejected": -41.30693817138672, + "loss": 0.2693, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 1.0121811628341675, + "rewards/margins": 1.8432265520095825, + "rewards/rejected": -0.831045389175415, + "step": 182 + }, + { + "epoch": 0.42466982946531606, + "grad_norm": 22.17904586209137, + "learning_rate": 4.811143038935873e-07, + "logits/chosen": 0.5580455660820007, + "logits/rejected": 0.5748550295829773, + "logps/chosen": -42.32413101196289, + "logps/rejected": -46.0750732421875, + "loss": 0.3264, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 1.0455000400543213, + "rewards/margins": 1.93173086643219, + "rewards/rejected": -0.8862307667732239, + "step": 184 + }, + { + "epoch": 0.4292858058725478, + "grad_norm": 30.29917055573095, + "learning_rate": 4.803379598087899e-07, + "logits/chosen": 0.5174715518951416, + "logits/rejected": 0.5311744213104248, + "logps/chosen": -40.50711441040039, + "logps/rejected": -40.298824310302734, + "loss": 0.316, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.9772326350212097, + "rewards/margins": 1.7723863124847412, + "rewards/rejected": -0.795153796672821, + "step": 186 + }, + { + "epoch": 0.43390178227977944, + "grad_norm": 40.38001852713412, + "learning_rate": 4.795466310264034e-07, + "logits/chosen": 0.42736437916755676, + "logits/rejected": 0.463912695646286, + "logps/chosen": -39.35895919799805, + "logps/rejected": -64.93545532226562, + "loss": 0.4185, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": 0.5966134667396545, + "rewards/margins": 1.980704665184021, + "rewards/rejected": -1.3840913772583008, + "step": 188 + }, + { + "epoch": 0.43851775868701115, + "grad_norm": 17.949323810733784, + "learning_rate": 4.787403690265335e-07, + "logits/chosen": 0.5044853091239929, + "logits/rejected": 0.5284148454666138, + "logps/chosen": -39.47854995727539, + "logps/rejected": -49.92608642578125, + "loss": 0.3266, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 1.0101630687713623, + "rewards/margins": 1.9091652631759644, + "rewards/rejected": -0.8990020751953125, + "step": 190 + }, + { + "epoch": 0.43851775868701115, + "eval_logits/chosen": 0.3972060978412628, + "eval_logits/rejected": 0.4221220314502716, + "eval_logps/chosen": -39.831119537353516, + "eval_logps/rejected": -50.09023666381836, + "eval_loss": 0.3243154287338257, + "eval_rewards/accuracies": 0.7914746403694153, + "eval_rewards/chosen": 0.9570826292037964, + "eval_rewards/margins": 1.9830337762832642, + "eval_rewards/rejected": -1.0259510278701782, + "eval_runtime": 220.3237, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 190 + }, + { + "epoch": 0.44313373509424286, + "grad_norm": 36.065620072852695, + "learning_rate": 4.779192262607702e-07, + "logits/chosen": 0.5138534903526306, + "logits/rejected": 0.544155478477478, + "logps/chosen": -43.310760498046875, + "logps/rejected": -59.56623840332031, + "loss": 0.3542, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.9537274837493896, + "rewards/margins": 2.111888885498047, + "rewards/rejected": -1.1581614017486572, + "step": 192 + }, + { + "epoch": 0.4477497115014745, + "grad_norm": 24.653058016123207, + "learning_rate": 4.770832561487758e-07, + "logits/chosen": 0.4504295885562897, + "logits/rejected": 0.46597781777381897, + "logps/chosen": -41.51498794555664, + "logps/rejected": -43.07120132446289, + "loss": 0.2587, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.9096213579177856, + "rewards/margins": 2.131375551223755, + "rewards/rejected": -1.2217543125152588, + "step": 194 + }, + { + "epoch": 0.45236568790870624, + "grad_norm": 36.95305184003922, + "learning_rate": 4.762325130748097e-07, + "logits/chosen": 0.5585076808929443, + "logits/rejected": 0.5717556476593018, + "logps/chosen": -47.50046920776367, + "logps/rejected": -44.811973571777344, + "loss": 0.3412, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.9956084489822388, + "rewards/margins": 1.8879083395004272, + "rewards/rejected": -0.8922999501228333, + "step": 196 + }, + { + "epoch": 0.45698166431593795, + "grad_norm": 16.999205852011567, + "learning_rate": 4.7536705238418995e-07, + "logits/chosen": 0.47373294830322266, + "logits/rejected": 0.49137142300605774, + "logps/chosen": -42.69048309326172, + "logps/rejected": -50.26279067993164, + "loss": 0.275, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.8595349788665771, + "rewards/margins": 2.2136645317077637, + "rewards/rejected": -1.3541297912597656, + "step": 198 + }, + { + "epoch": 0.4615976407231696, + "grad_norm": 33.06750404898565, + "learning_rate": 4.7448693037969336e-07, + "logits/chosen": 0.5136507749557495, + "logits/rejected": 0.527184247970581, + "logps/chosen": -41.794132232666016, + "logps/rejected": -48.2490119934082, + "loss": 0.2986, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.8771340847015381, + "rewards/margins": 1.9797013998031616, + "rewards/rejected": -1.102567195892334, + "step": 200 + }, + { + "epoch": 0.4615976407231696, + "eval_logits/chosen": 0.39842745661735535, + "eval_logits/rejected": 0.42355066537857056, + "eval_logps/chosen": -40.12582778930664, + "eval_logps/rejected": -50.502620697021484, + "eval_loss": 0.3160472810268402, + "eval_rewards/accuracies": 0.7978110313415527, + "eval_rewards/chosen": 0.8097268342971802, + "eval_rewards/margins": 2.041868209838867, + "eval_rewards/rejected": -1.2321414947509766, + "eval_runtime": 220.4769, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.968, + "step": 200 + }, + { + "epoch": 0.4662136171304013, + "grad_norm": 27.6688088244827, + "learning_rate": 4.735922043178923e-07, + "logits/chosen": 0.5529847741127014, + "logits/rejected": 0.5818406939506531, + "logps/chosen": -42.29270553588867, + "logps/rejected": -57.84202575683594, + "loss": 0.2725, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.8104487061500549, + "rewards/margins": 2.3321969509124756, + "rewards/rejected": -1.521748423576355, + "step": 202 + }, + { + "epoch": 0.47082959353763304, + "grad_norm": 23.404484719369563, + "learning_rate": 4.7268293240543017e-07, + "logits/chosen": 0.48225533962249756, + "logits/rejected": 0.5109025239944458, + "logps/chosen": -40.953433990478516, + "logps/rejected": -55.026153564453125, + "loss": 0.3435, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.8147386908531189, + "rewards/margins": 2.057671546936035, + "rewards/rejected": -1.2429331541061401, + "step": 204 + }, + { + "epoch": 0.4754455699448647, + "grad_norm": 29.663210206611154, + "learning_rate": 4.717591737952344e-07, + "logits/chosen": 0.48208919167518616, + "logits/rejected": 0.517291247844696, + "logps/chosen": -36.30723190307617, + "logps/rejected": -54.3764533996582, + "loss": 0.3135, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8137081861495972, + "rewards/margins": 2.101260185241699, + "rewards/rejected": -1.287551999092102, + "step": 206 + }, + { + "epoch": 0.4800615463520964, + "grad_norm": 29.39474251364716, + "learning_rate": 4.7082098858266837e-07, + "logits/chosen": 0.48040205240249634, + "logits/rejected": 0.5284512042999268, + "logps/chosen": -31.84227180480957, + "logps/rejected": -61.47830581665039, + "loss": 0.3821, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": 0.455925315618515, + "rewards/margins": 2.105367422103882, + "rewards/rejected": -1.649442195892334, + "step": 208 + }, + { + "epoch": 0.4846775227593281, + "grad_norm": 15.879511628269139, + "learning_rate": 4.698684378016222e-07, + "logits/chosen": 0.4825616478919983, + "logits/rejected": 0.5131646394729614, + "logps/chosen": -43.97586441040039, + "logps/rejected": -58.62031936645508, + "loss": 0.271, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.6879336833953857, + "rewards/margins": 2.212796211242676, + "rewards/rejected": -1.5248624086380005, + "step": 210 + }, + { + "epoch": 0.4846775227593281, + "eval_logits/chosen": 0.40579578280448914, + "eval_logits/rejected": 0.43089571595191956, + "eval_logps/chosen": -40.55123519897461, + "eval_logps/rejected": -51.04283905029297, + "eval_loss": 0.3111670911312103, + "eval_rewards/accuracies": 0.804147481918335, + "eval_rewards/chosen": 0.597020149230957, + "eval_rewards/margins": 2.099271535873413, + "eval_rewards/rejected": -1.502251386642456, + "eval_runtime": 220.3759, + "eval_samples_per_second": 7.868, + "eval_steps_per_second": 1.969, + "step": 210 + }, + { + "epoch": 0.48929349916655984, + "grad_norm": 33.220388342252136, + "learning_rate": 4.6890158342054174e-07, + "logits/chosen": 0.46122825145721436, + "logits/rejected": 0.48773014545440674, + "logps/chosen": -38.094722747802734, + "logps/rejected": -50.649871826171875, + "loss": 0.3288, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.5131194591522217, + "rewards/margins": 2.1884312629699707, + "rewards/rejected": -1.6753116846084595, + "step": 212 + }, + { + "epoch": 0.4939094755737915, + "grad_norm": 27.37607169791161, + "learning_rate": 4.679204883383973e-07, + "logits/chosen": 0.45677465200424194, + "logits/rejected": 0.5006839632987976, + "logps/chosen": -36.343292236328125, + "logps/rejected": -65.76275634765625, + "loss": 0.301, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.5972538590431213, + "rewards/margins": 2.6644716262817383, + "rewards/rejected": -2.0672178268432617, + "step": 214 + }, + { + "epoch": 0.4985254519810232, + "grad_norm": 28.712191033509406, + "learning_rate": 4.669252163805919e-07, + "logits/chosen": 0.48203393816947937, + "logits/rejected": 0.5129568576812744, + "logps/chosen": -40.263328552246094, + "logps/rejected": -53.96393966674805, + "loss": 0.3434, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.3674449920654297, + "rewards/margins": 2.094463348388672, + "rewards/rejected": -1.7270184755325317, + "step": 216 + }, + { + "epoch": 0.5031414283882549, + "grad_norm": 21.430060439194165, + "learning_rate": 4.65915832294809e-07, + "logits/chosen": 0.5647565722465515, + "logits/rejected": 0.6052375435829163, + "logps/chosen": -37.24385070800781, + "logps/rejected": -58.28202438354492, + "loss": 0.2945, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.5437911748886108, + "rewards/margins": 2.518171787261963, + "rewards/rejected": -1.9743802547454834, + "step": 218 + }, + { + "epoch": 0.5077574047954866, + "grad_norm": 24.194015322932014, + "learning_rate": 4.6489240174680026e-07, + "logits/chosen": 0.5365298390388489, + "logits/rejected": 0.5451048612594604, + "logps/chosen": -40.26055145263672, + "logps/rejected": -40.11984634399414, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4349411725997925, + "rewards/margins": 1.4253244400024414, + "rewards/rejected": -0.9903832674026489, + "step": 220 + }, + { + "epoch": 0.5077574047954866, + "eval_logits/chosen": 0.40611767768859863, + "eval_logits/rejected": 0.43133166432380676, + "eval_logps/chosen": -40.628150939941406, + "eval_logps/rejected": -51.22826385498047, + "eval_loss": 0.30713996291160583, + "eval_rewards/accuracies": 0.8018433451652527, + "eval_rewards/chosen": 0.5585668087005615, + "eval_rewards/margins": 2.153529644012451, + "eval_rewards/rejected": -1.5949628353118896, + "eval_runtime": 220.3416, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 220 + }, + { + "epoch": 0.5123733812027182, + "grad_norm": 23.39715012730976, + "learning_rate": 4.638549913161138e-07, + "logits/chosen": 0.5600088834762573, + "logits/rejected": 0.5736495852470398, + "logps/chosen": -46.20627212524414, + "logps/rejected": -47.1099739074707, + "loss": 0.2227, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.7162383794784546, + "rewards/margins": 2.4795196056365967, + "rewards/rejected": -1.763281226158142, + "step": 222 + }, + { + "epoch": 0.51698935760995, + "grad_norm": 23.70013936518676, + "learning_rate": 4.6280366849176267e-07, + "logits/chosen": 0.553576648235321, + "logits/rejected": 0.5800661444664001, + "logps/chosen": -41.73429870605469, + "logps/rejected": -47.09934997558594, + "loss": 0.2708, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6174063682556152, + "rewards/margins": 2.10538649559021, + "rewards/rejected": -1.4879801273345947, + "step": 224 + }, + { + "epoch": 0.5216053340171817, + "grad_norm": 19.39438827436505, + "learning_rate": 4.6173850166783446e-07, + "logits/chosen": 0.5699052810668945, + "logits/rejected": 0.5908712148666382, + "logps/chosen": -40.74462127685547, + "logps/rejected": -53.7403450012207, + "loss": 0.2716, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.5502187609672546, + "rewards/margins": 2.0002176761627197, + "rewards/rejected": -1.4499988555908203, + "step": 226 + }, + { + "epoch": 0.5262213104244133, + "grad_norm": 24.49934372199594, + "learning_rate": 4.606595601390417e-07, + "logits/chosen": 0.46904435753822327, + "logits/rejected": 0.5106580257415771, + "logps/chosen": -39.85272979736328, + "logps/rejected": -61.70741653442383, + "loss": 0.2336, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3319948613643646, + "rewards/margins": 2.6446897983551025, + "rewards/rejected": -2.312695026397705, + "step": 228 + }, + { + "epoch": 0.5308372868316451, + "grad_norm": 28.165420212664795, + "learning_rate": 4.595669140962143e-07, + "logits/chosen": 0.4127655625343323, + "logits/rejected": 0.479299396276474, + "logps/chosen": -34.939422607421875, + "logps/rejected": -78.63516235351562, + "loss": 0.3107, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.18619215488433838, + "rewards/margins": 2.8822548389434814, + "rewards/rejected": -2.6960630416870117, + "step": 230 + }, + { + "epoch": 0.5308372868316451, + "eval_logits/chosen": 0.4082220494747162, + "eval_logits/rejected": 0.4335884749889374, + "eval_logps/chosen": -40.824676513671875, + "eval_logps/rejected": -51.529090881347656, + "eval_loss": 0.30161648988723755, + "eval_rewards/accuracies": 0.8104838728904724, + "eval_rewards/chosen": 0.4603023827075958, + "eval_rewards/margins": 2.205678939819336, + "eval_rewards/rejected": -1.745376706123352, + "eval_runtime": 220.269, + "eval_samples_per_second": 7.872, + "eval_steps_per_second": 1.97, + "step": 230 + }, + { + "epoch": 0.5354532632388768, + "grad_norm": 16.564014282307436, + "learning_rate": 4.5846063462173284e-07, + "logits/chosen": 0.5141347050666809, + "logits/rejected": 0.5398997664451599, + "logps/chosen": -38.93478012084961, + "logps/rejected": -53.1637077331543, + "loss": 0.2932, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.3137105405330658, + "rewards/margins": 2.214162826538086, + "rewards/rejected": -1.9004522562026978, + "step": 232 + }, + { + "epoch": 0.5400692396461084, + "grad_norm": 30.180896923031582, + "learning_rate": 4.573407936849044e-07, + "logits/chosen": 0.49748367071151733, + "logits/rejected": 0.502750039100647, + "logps/chosen": -46.67736053466797, + "logps/rejected": -48.594566345214844, + "loss": 0.3143, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.39324572682380676, + "rewards/margins": 1.9298076629638672, + "rewards/rejected": -1.5365619659423828, + "step": 234 + }, + { + "epoch": 0.5446852160533402, + "grad_norm": 43.03719615392396, + "learning_rate": 4.5620746413728063e-07, + "logits/chosen": 0.5845724940299988, + "logits/rejected": 0.5915371775627136, + "logps/chosen": -52.0160026550293, + "logps/rejected": -49.12672805786133, + "loss": 0.2833, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.14373371005058289, + "rewards/margins": 2.1639184951782227, + "rewards/rejected": -2.0201845169067383, + "step": 236 + }, + { + "epoch": 0.5493011924605719, + "grad_norm": 21.1030283537707, + "learning_rate": 4.550607197079185e-07, + "logits/chosen": 0.552834153175354, + "logits/rejected": 0.5818264484405518, + "logps/chosen": -38.04405212402344, + "logps/rejected": -46.87253189086914, + "loss": 0.2897, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.4428212344646454, + "rewards/margins": 1.7602063417434692, + "rewards/rejected": -1.317385196685791, + "step": 238 + }, + { + "epoch": 0.5539171688678035, + "grad_norm": 14.340136381864786, + "learning_rate": 4.5390063499858353e-07, + "logits/chosen": 0.5454181432723999, + "logits/rejected": 0.5769542455673218, + "logps/chosen": -47.16811752319336, + "logps/rejected": -62.15293884277344, + "loss": 0.2046, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.36544325947761536, + "rewards/margins": 2.6488418579101562, + "rewards/rejected": -2.2833986282348633, + "step": 240 + }, + { + "epoch": 0.5539171688678035, + "eval_logits/chosen": 0.41252779960632324, + "eval_logits/rejected": 0.4378991425037384, + "eval_logps/chosen": -40.974342346191406, + "eval_logps/rejected": -51.8930778503418, + "eval_loss": 0.2962896525859833, + "eval_rewards/accuracies": 0.8070276379585266, + "eval_rewards/chosen": 0.3854685127735138, + "eval_rewards/margins": 2.312840223312378, + "eval_rewards/rejected": -1.927371859550476, + "eval_runtime": 220.4271, + "eval_samples_per_second": 7.867, + "eval_steps_per_second": 1.969, + "step": 240 + }, + { + "epoch": 0.5585331452750353, + "grad_norm": 20.121912107871452, + "learning_rate": 4.5272728547889687e-07, + "logits/chosen": 0.5017317533493042, + "logits/rejected": 0.5252359509468079, + "logps/chosen": -43.418678283691406, + "logps/rejected": -51.78999710083008, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29254353046417236, + "rewards/margins": 2.571570873260498, + "rewards/rejected": -2.2790274620056152, + "step": 242 + }, + { + "epoch": 0.5631491216822669, + "grad_norm": 36.79556689673262, + "learning_rate": 4.5154074748142535e-07, + "logits/chosen": 0.5326908230781555, + "logits/rejected": 0.5592876672744751, + "logps/chosen": -45.176578521728516, + "logps/rejected": -55.26374053955078, + "loss": 0.2959, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.26444554328918457, + "rewards/margins": 2.2498509883880615, + "rewards/rejected": -1.985405445098877, + "step": 244 + }, + { + "epoch": 0.5677650980894986, + "grad_norm": 30.279268688467162, + "learning_rate": 4.503410981967158e-07, + "logits/chosen": 0.508591890335083, + "logits/rejected": 0.5472189784049988, + "logps/chosen": -37.81255340576172, + "logps/rejected": -59.81355285644531, + "loss": 0.3387, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.324074387550354, + "rewards/margins": 2.479010581970215, + "rewards/rejected": -2.1549363136291504, + "step": 246 + }, + { + "epoch": 0.5723810744967304, + "grad_norm": 32.696656835575155, + "learning_rate": 4.4912841566827333e-07, + "logits/chosen": 0.5358154773712158, + "logits/rejected": 0.572979211807251, + "logps/chosen": -40.84016799926758, + "logps/rejected": -57.57326889038086, + "loss": 0.2559, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.6461736559867859, + "rewards/margins": 2.717188835144043, + "rewards/rejected": -2.0710153579711914, + "step": 248 + }, + { + "epoch": 0.576997050903962, + "grad_norm": 26.864795137183627, + "learning_rate": 4.4790277878748415e-07, + "logits/chosen": 0.5129296779632568, + "logits/rejected": 0.543644368648529, + "logps/chosen": -36.90694046020508, + "logps/rejected": -51.41253662109375, + "loss": 0.2466, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.4497109651565552, + "rewards/margins": 2.559537172317505, + "rewards/rejected": -2.1098265647888184, + "step": 250 + }, + { + "epoch": 0.576997050903962, + "eval_logits/chosen": 0.4140053689479828, + "eval_logits/rejected": 0.43953680992126465, + "eval_logps/chosen": -40.92128372192383, + "eval_logps/rejected": -52.06728744506836, + "eval_loss": 0.29202744364738464, + "eval_rewards/accuracies": 0.8064516186714172, + "eval_rewards/chosen": 0.41199636459350586, + "eval_rewards/margins": 2.4264743328094482, + "eval_rewards/rejected": -2.0144779682159424, + "eval_runtime": 220.3958, + "eval_samples_per_second": 7.868, + "eval_steps_per_second": 1.969, + "step": 250 + }, + { + "epoch": 0.5816130273111938, + "grad_norm": 34.34355868179491, + "learning_rate": 4.466642672884835e-07, + "logits/chosen": 0.5273095965385437, + "logits/rejected": 0.5604310631752014, + "logps/chosen": -39.039512634277344, + "logps/rejected": -52.470951080322266, + "loss": 0.2676, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.256040096282959, + "rewards/margins": 2.4306235313415527, + "rewards/rejected": -2.1745834350585938, + "step": 252 + }, + { + "epoch": 0.5862290037184255, + "grad_norm": 27.545044099293104, + "learning_rate": 4.454129617429682e-07, + "logits/chosen": 0.515310525894165, + "logits/rejected": 0.5264334678649902, + "logps/chosen": -41.25297546386719, + "logps/rejected": -44.831031799316406, + "loss": 0.2921, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.2963744103908539, + "rewards/margins": 2.2201662063598633, + "rewards/rejected": -1.9237921237945557, + "step": 254 + }, + { + "epoch": 0.5908449801256571, + "grad_norm": 16.22258168997157, + "learning_rate": 4.441489435549551e-07, + "logits/chosen": 0.5497354865074158, + "logits/rejected": 0.5820472240447998, + "logps/chosen": -45.16104507446289, + "logps/rejected": -60.09016799926758, + "loss": 0.2492, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.36222705245018005, + "rewards/margins": 2.6290435791015625, + "rewards/rejected": -2.2668166160583496, + "step": 256 + }, + { + "epoch": 0.5954609565328889, + "grad_norm": 22.519317936372268, + "learning_rate": 4.4287229495548573e-07, + "logits/chosen": 0.5290111303329468, + "logits/rejected": 0.550987184047699, + "logps/chosen": -45.896942138671875, + "logps/rejected": -57.38431930541992, + "loss": 0.2158, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.3132680654525757, + "rewards/margins": 2.935549020767212, + "rewards/rejected": -2.622281074523926, + "step": 258 + }, + { + "epoch": 0.6000769329401205, + "grad_norm": 33.27879387908239, + "learning_rate": 4.415830989972761e-07, + "logits/chosen": 0.613827645778656, + "logits/rejected": 0.6395273208618164, + "logps/chosen": -40.98984146118164, + "logps/rejected": -48.8809700012207, + "loss": 0.3209, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.3634183704853058, + "rewards/margins": 2.285569190979004, + "rewards/rejected": -1.922150731086731, + "step": 260 + }, + { + "epoch": 0.6000769329401205, + "eval_logits/chosen": 0.41586774587631226, + "eval_logits/rejected": 0.4413994550704956, + "eval_logps/chosen": -41.435340881347656, + "eval_logps/rejected": -52.66230773925781, + "eval_loss": 0.28806936740875244, + "eval_rewards/accuracies": 0.8116359710693359, + "eval_rewards/chosen": 0.15496963262557983, + "eval_rewards/margins": 2.4669582843780518, + "eval_rewards/rejected": -2.3119888305664062, + "eval_runtime": 220.1153, + "eval_samples_per_second": 7.878, + "eval_steps_per_second": 1.972, + "step": 260 + }, + { + "epoch": 0.6046929093473522, + "grad_norm": 28.090703957454657, + "learning_rate": 4.402814395493142e-07, + "logits/chosen": 0.49612462520599365, + "logits/rejected": 0.4979320168495178, + "logps/chosen": -40.7058219909668, + "logps/rejected": -38.908050537109375, + "loss": 0.3653, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.15811699628829956, + "rewards/margins": 1.8890395164489746, + "rewards/rejected": -1.7309226989746094, + "step": 262 + }, + { + "epoch": 0.609308885754584, + "grad_norm": 20.963207734816056, + "learning_rate": 4.3896740129140354e-07, + "logits/chosen": 0.49926820397377014, + "logits/rejected": 0.518930196762085, + "logps/chosen": -41.947425842285156, + "logps/rejected": -42.273597717285156, + "loss": 0.2493, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.2666120231151581, + "rewards/margins": 2.4279704093933105, + "rewards/rejected": -2.161358594894409, + "step": 264 + }, + { + "epoch": 0.6139248621618156, + "grad_norm": 24.847993356607933, + "learning_rate": 4.3764106970865456e-07, + "logits/chosen": 0.5007407665252686, + "logits/rejected": 0.5330516695976257, + "logps/chosen": -36.07570266723633, + "logps/rejected": -50.92935562133789, + "loss": 0.3174, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.02925288677215576, + "rewards/margins": 2.231614589691162, + "rewards/rejected": -2.202361583709717, + "step": 266 + }, + { + "epoch": 0.6185408385690473, + "grad_norm": 26.539349634561272, + "learning_rate": 4.3630253108592305e-07, + "logits/chosen": 0.5235443115234375, + "logits/rejected": 0.5463228821754456, + "logps/chosen": -48.52283477783203, + "logps/rejected": -54.78059387207031, + "loss": 0.2266, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.005189484916627407, + "rewards/margins": 2.9114773273468018, + "rewards/rejected": -2.9062881469726562, + "step": 268 + }, + { + "epoch": 0.6231568149762791, + "grad_norm": 35.3397663590889, + "learning_rate": 4.3495187250219723e-07, + "logits/chosen": 0.4959086775779724, + "logits/rejected": 0.5330989360809326, + "logps/chosen": -37.50285339355469, + "logps/rejected": -56.99623489379883, + "loss": 0.2865, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.16485626995563507, + "rewards/margins": 2.9254465103149414, + "rewards/rejected": -3.0903029441833496, + "step": 270 + }, + { + "epoch": 0.6231568149762791, + "eval_logits/chosen": 0.4182251989841461, + "eval_logits/rejected": 0.44391536712646484, + "eval_logps/chosen": -41.51067352294922, + "eval_logps/rejected": -52.77988052368164, + "eval_loss": 0.2869359254837036, + "eval_rewards/accuracies": 0.8116359710693359, + "eval_rewards/chosen": 0.11730305105447769, + "eval_rewards/margins": 2.488077163696289, + "eval_rewards/rejected": -2.3707735538482666, + "eval_runtime": 220.1579, + "eval_samples_per_second": 7.876, + "eval_steps_per_second": 1.971, + "step": 270 + }, + { + "epoch": 0.6277727913835107, + "grad_norm": 23.403340630174217, + "learning_rate": 4.3358918182493253e-07, + "logits/chosen": 0.5670427083969116, + "logits/rejected": 0.5846278071403503, + "logps/chosen": -41.197166442871094, + "logps/rejected": -48.75783920288086, + "loss": 0.229, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.05103777348995209, + "rewards/margins": 2.2875313758850098, + "rewards/rejected": -2.338569164276123, + "step": 272 + }, + { + "epoch": 0.6323887677907424, + "grad_norm": 31.35543837574939, + "learning_rate": 4.3221454770433554e-07, + "logits/chosen": 0.5044899582862854, + "logits/rejected": 0.5252879858016968, + "logps/chosen": -46.43470764160156, + "logps/rejected": -50.872764587402344, + "loss": 0.2558, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.030280061066150665, + "rewards/margins": 2.529269218444824, + "rewards/rejected": -2.4989893436431885, + "step": 274 + }, + { + "epoch": 0.6370047441979741, + "grad_norm": 27.239886684790495, + "learning_rate": 4.308280595675966e-07, + "logits/chosen": 0.5399680733680725, + "logits/rejected": 0.5539530515670776, + "logps/chosen": -45.22441101074219, + "logps/rejected": -51.61985397338867, + "loss": 0.3439, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -0.1256939023733139, + "rewards/margins": 2.2664339542388916, + "rewards/rejected": -2.392127752304077, + "step": 276 + }, + { + "epoch": 0.6416207206052058, + "grad_norm": 29.254953852014435, + "learning_rate": 4.2942980761307227e-07, + "logits/chosen": 0.5513600707054138, + "logits/rejected": 0.5763798356056213, + "logps/chosen": -42.95576477050781, + "logps/rejected": -53.852542877197266, + "loss": 0.2795, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.3323157727718353, + "rewards/margins": 2.3478498458862305, + "rewards/rejected": -2.680166006088257, + "step": 278 + }, + { + "epoch": 0.6462366970124375, + "grad_norm": 16.01715280590405, + "learning_rate": 4.2801988280441765e-07, + "logits/chosen": 0.5487841367721558, + "logits/rejected": 0.5692893862724304, + "logps/chosen": -45.817508697509766, + "logps/rejected": -54.61252975463867, + "loss": 0.2162, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.03073420189321041, + "rewards/margins": 2.809882402420044, + "rewards/rejected": -2.840616226196289, + "step": 280 + }, + { + "epoch": 0.6462366970124375, + "eval_logits/chosen": 0.41910773515701294, + "eval_logits/rejected": 0.44490164518356323, + "eval_logps/chosen": -41.43645477294922, + "eval_logps/rejected": -52.90102005004883, + "eval_loss": 0.2802717387676239, + "eval_rewards/accuracies": 0.8104838728904724, + "eval_rewards/chosen": 0.15440984070301056, + "eval_rewards/margins": 2.585754156112671, + "eval_rewards/rejected": -2.431344509124756, + "eval_runtime": 220.3099, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 280 + }, + { + "epoch": 0.6508526734196692, + "grad_norm": 21.181113416054586, + "learning_rate": 4.2659837686466813e-07, + "logits/chosen": 0.498602032661438, + "logits/rejected": 0.5217832922935486, + "logps/chosen": -40.613285064697266, + "logps/rejected": -50.06806945800781, + "loss": 0.262, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.07628664374351501, + "rewards/margins": 2.542593240737915, + "rewards/rejected": -2.466306447982788, + "step": 282 + }, + { + "epoch": 0.6554686498269009, + "grad_norm": 27.465624654814576, + "learning_rate": 4.25165382270273e-07, + "logits/chosen": 0.5099713206291199, + "logits/rejected": 0.5337219834327698, + "logps/chosen": -37.57986831665039, + "logps/rejected": -45.39601516723633, + "loss": 0.2483, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.15927743911743164, + "rewards/margins": 2.373776912689209, + "rewards/rejected": -2.2144994735717773, + "step": 284 + }, + { + "epoch": 0.6600846262341326, + "grad_norm": 24.232084058794833, + "learning_rate": 4.2372099224507875e-07, + "logits/chosen": 0.47430500388145447, + "logits/rejected": 0.5168524980545044, + "logps/chosen": -34.61323547363281, + "logps/rejected": -60.36859130859375, + "loss": 0.2904, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.013289166614413261, + "rewards/margins": 2.84716534614563, + "rewards/rejected": -2.860454797744751, + "step": 286 + }, + { + "epoch": 0.6647006026413643, + "grad_norm": 28.26074226923709, + "learning_rate": 4.2226530075426503e-07, + "logits/chosen": 0.5559656620025635, + "logits/rejected": 0.562049150466919, + "logps/chosen": -48.77291488647461, + "logps/rejected": -52.30695343017578, + "loss": 0.2904, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.016986362636089325, + "rewards/margins": 2.4160873889923096, + "rewards/rejected": -2.3991012573242188, + "step": 288 + }, + { + "epoch": 0.669316579048596, + "grad_norm": 25.964047989048964, + "learning_rate": 4.2079840249823106e-07, + "logits/chosen": 0.5188059210777283, + "logits/rejected": 0.5476034879684448, + "logps/chosen": -43.39430236816406, + "logps/rejected": -63.02970886230469, + "loss": 0.2964, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.22233732044696808, + "rewards/margins": 2.6584837436676025, + "rewards/rejected": -2.8808212280273438, + "step": 290 + }, + { + "epoch": 0.669316579048596, + "eval_logits/chosen": 0.41873642802238464, + "eval_logits/rejected": 0.44454658031463623, + "eval_logps/chosen": -41.64173126220703, + "eval_logps/rejected": -53.234169006347656, + "eval_loss": 0.27578282356262207, + "eval_rewards/accuracies": 0.8127880096435547, + "eval_rewards/chosen": 0.05177304521203041, + "eval_rewards/margins": 2.6496896743774414, + "eval_rewards/rejected": -2.597916603088379, + "eval_runtime": 220.2319, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 290 + }, + { + "epoch": 0.6739325554558276, + "grad_norm": 28.11981406671555, + "learning_rate": 4.193203929064353e-07, + "logits/chosen": 0.5352766513824463, + "logits/rejected": 0.5633915066719055, + "logps/chosen": -43.08574676513672, + "logps/rejected": -63.65277099609375, + "loss": 0.292, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -0.09769348800182343, + "rewards/margins": 2.7585980892181396, + "rewards/rejected": -2.8562917709350586, + "step": 292 + }, + { + "epoch": 0.6785485318630594, + "grad_norm": 22.159785280949862, + "learning_rate": 4.1783136813118705e-07, + "logits/chosen": 0.5104035139083862, + "logits/rejected": 0.5326347947120667, + "logps/chosen": -44.235877990722656, + "logps/rejected": -53.24985885620117, + "loss": 0.2764, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.3315318822860718, + "rewards/margins": 2.574824810028076, + "rewards/rejected": -2.9063568115234375, + "step": 294 + }, + { + "epoch": 0.6831645082702911, + "grad_norm": 16.58376439365046, + "learning_rate": 4.163314250413913e-07, + "logits/chosen": 0.5757681131362915, + "logits/rejected": 0.6053035855293274, + "logps/chosen": -40.00181579589844, + "logps/rejected": -50.29273986816406, + "loss": 0.193, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.18691450357437134, + "rewards/margins": 2.6521503925323486, + "rewards/rejected": -2.465236186981201, + "step": 296 + }, + { + "epoch": 0.6877804846775227, + "grad_norm": 32.319500846176076, + "learning_rate": 4.1482066121624716e-07, + "logits/chosen": 0.5265994668006897, + "logits/rejected": 0.5376725792884827, + "logps/chosen": -42.3819580078125, + "logps/rejected": -43.448524475097656, + "loss": 0.3285, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.1531985104084015, + "rewards/margins": 2.268404245376587, + "rewards/rejected": -2.115206003189087, + "step": 298 + }, + { + "epoch": 0.6923964610847545, + "grad_norm": 23.349636529497012, + "learning_rate": 4.1329917493889933e-07, + "logits/chosen": 0.43518775701522827, + "logits/rejected": 0.46238911151885986, + "logps/chosen": -39.432003021240234, + "logps/rejected": -52.38154983520508, + "loss": 0.2382, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.1465599089860916, + "rewards/margins": 2.628819704055786, + "rewards/rejected": -2.7753796577453613, + "step": 300 + }, + { + "epoch": 0.6923964610847545, + "eval_logits/chosen": 0.4236195683479309, + "eval_logits/rejected": 0.4493381381034851, + "eval_logps/chosen": -41.62788009643555, + "eval_logps/rejected": -53.235809326171875, + "eval_loss": 0.2743636965751648, + "eval_rewards/accuracies": 0.8122119903564453, + "eval_rewards/chosen": 0.05869903042912483, + "eval_rewards/margins": 2.6574366092681885, + "eval_rewards/rejected": -2.5987374782562256, + "eval_runtime": 220.281, + "eval_samples_per_second": 7.872, + "eval_steps_per_second": 1.97, + "step": 300 + }, + { + "epoch": 0.6970124374919862, + "grad_norm": 23.497513813632327, + "learning_rate": 4.117670651900446e-07, + "logits/chosen": 0.5692274570465088, + "logits/rejected": 0.5857737064361572, + "logps/chosen": -44.88375473022461, + "logps/rejected": -50.89904022216797, + "loss": 0.3059, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": -0.18370471894741058, + "rewards/margins": 2.1322684288024902, + "rewards/rejected": -2.3159730434417725, + "step": 302 + }, + { + "epoch": 0.7016284138992178, + "grad_norm": 31.67224576363876, + "learning_rate": 4.1022443164149237e-07, + "logits/chosen": 0.48219427466392517, + "logits/rejected": 0.5107440948486328, + "logps/chosen": -46.37804412841797, + "logps/rejected": -62.33393859863281, + "loss": 0.2685, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.04567752406001091, + "rewards/margins": 2.84682559967041, + "rewards/rejected": -2.892503261566162, + "step": 304 + }, + { + "epoch": 0.7062443903064496, + "grad_norm": 19.857257644454698, + "learning_rate": 4.086713746496808e-07, + "logits/chosen": 0.5637336373329163, + "logits/rejected": 0.588976263999939, + "logps/chosen": -39.28482437133789, + "logps/rejected": -50.71957778930664, + "loss": 0.2575, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.2317693531513214, + "rewards/margins": 2.6872549057006836, + "rewards/rejected": -2.4554860591888428, + "step": 306 + }, + { + "epoch": 0.7108603667136812, + "grad_norm": 17.71463775233371, + "learning_rate": 4.0710799524914805e-07, + "logits/chosen": 0.5934479832649231, + "logits/rejected": 0.6081465482711792, + "logps/chosen": -50.33334732055664, + "logps/rejected": -55.25143814086914, + "loss": 0.2103, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.034923017024993896, + "rewards/margins": 2.6961231231689453, + "rewards/rejected": -2.731046199798584, + "step": 308 + }, + { + "epoch": 0.7154763431209129, + "grad_norm": 19.132153588643654, + "learning_rate": 4.055343951459592e-07, + "logits/chosen": 0.5560102462768555, + "logits/rejected": 0.5947719812393188, + "logps/chosen": -37.43670654296875, + "logps/rejected": -57.06461715698242, + "loss": 0.226, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.07254935055971146, + "rewards/margins": 2.918682336807251, + "rewards/rejected": -2.991231918334961, + "step": 310 + }, + { + "epoch": 0.7154763431209129, + "eval_logits/chosen": 0.42303159832954407, + "eval_logits/rejected": 0.44889286160469055, + "eval_logps/chosen": -41.60685348510742, + "eval_logps/rejected": -53.284358978271484, + "eval_loss": 0.27253130078315735, + "eval_rewards/accuracies": 0.8133640289306641, + "eval_rewards/chosen": 0.06921074539422989, + "eval_rewards/margins": 2.692223072052002, + "eval_rewards/rejected": -2.6230127811431885, + "eval_runtime": 220.2961, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 310 + }, + { + "epoch": 0.7200923195281447, + "grad_norm": 20.574269162073108, + "learning_rate": 4.0395067671108985e-07, + "logits/chosen": 0.47218936681747437, + "logits/rejected": 0.5014721155166626, + "logps/chosen": -35.916664123535156, + "logps/rejected": -44.856101989746094, + "loss": 0.2579, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.182376891374588, + "rewards/margins": 2.569021701812744, + "rewards/rejected": -2.3866446018218994, + "step": 312 + }, + { + "epoch": 0.7247082959353763, + "grad_norm": 30.250167869534483, + "learning_rate": 4.0235694297376637e-07, + "logits/chosen": 0.5631113648414612, + "logits/rejected": 0.5769122242927551, + "logps/chosen": -49.87733459472656, + "logps/rejected": -55.8229866027832, + "loss": 0.2861, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.1988232433795929, + "rewards/margins": 2.635685443878174, + "rewards/rejected": -2.4368624687194824, + "step": 314 + }, + { + "epoch": 0.729324272342608, + "grad_norm": 32.09859733085628, + "learning_rate": 4.0075329761476347e-07, + "logits/chosen": 0.5582194924354553, + "logits/rejected": 0.5716796517372131, + "logps/chosen": -44.06077575683594, + "logps/rejected": -48.060577392578125, + "loss": 0.2637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14417774975299835, + "rewards/margins": 2.182429313659668, + "rewards/rejected": -2.3266072273254395, + "step": 316 + }, + { + "epoch": 0.7339402487498398, + "grad_norm": 20.839702603979845, + "learning_rate": 3.991398449596588e-07, + "logits/chosen": 0.5104639530181885, + "logits/rejected": 0.5302228331565857, + "logps/chosen": -46.450565338134766, + "logps/rejected": -56.8250732421875, + "loss": 0.2178, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.05337013676762581, + "rewards/margins": 2.7899389266967773, + "rewards/rejected": -2.7365689277648926, + "step": 318 + }, + { + "epoch": 0.7385562251570714, + "grad_norm": 35.607964067039056, + "learning_rate": 3.9751668997204647e-07, + "logits/chosen": 0.573165774345398, + "logits/rejected": 0.592732310295105, + "logps/chosen": -46.10280990600586, + "logps/rejected": -53.3104248046875, + "loss": 0.238, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.08940169960260391, + "rewards/margins": 2.5656909942626953, + "rewards/rejected": -2.655092716217041, + "step": 320 + }, + { + "epoch": 0.7385562251570714, + "eval_logits/chosen": 0.4224054217338562, + "eval_logits/rejected": 0.4482380449771881, + "eval_logps/chosen": -41.65960693359375, + "eval_logps/rejected": -53.47556686401367, + "eval_loss": 0.2701371908187866, + "eval_rewards/accuracies": 0.8185483813285828, + "eval_rewards/chosen": 0.04283595457673073, + "eval_rewards/margins": 2.761453866958618, + "eval_rewards/rejected": -2.718618154525757, + "eval_runtime": 220.4956, + "eval_samples_per_second": 7.864, + "eval_steps_per_second": 1.968, + "step": 320 + }, + { + "epoch": 0.7431722015643031, + "grad_norm": 40.34998221595971, + "learning_rate": 3.958839382467084e-07, + "logits/chosen": 0.5077357888221741, + "logits/rejected": 0.5302278995513916, + "logps/chosen": -38.23583984375, + "logps/rejected": -49.62001037597656, + "loss": 0.2911, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.431808739900589, + "rewards/margins": 2.4383790493011475, + "rewards/rejected": -2.0065698623657227, + "step": 322 + }, + { + "epoch": 0.7477881779715349, + "grad_norm": 37.34949673704143, + "learning_rate": 3.9424169600274494e-07, + "logits/chosen": 0.5166856646537781, + "logits/rejected": 0.5311781167984009, + "logps/chosen": -43.24025344848633, + "logps/rejected": -48.49333190917969, + "loss": 0.3054, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.1698003113269806, + "rewards/margins": 2.2522177696228027, + "rewards/rejected": -2.422018051147461, + "step": 324 + }, + { + "epoch": 0.7524041543787665, + "grad_norm": 25.91010722050029, + "learning_rate": 3.9259007007666436e-07, + "logits/chosen": 0.5167285203933716, + "logits/rejected": 0.5338759422302246, + "logps/chosen": -44.82267761230469, + "logps/rejected": -55.40620803833008, + "loss": 0.2723, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.06528851389884949, + "rewards/margins": 2.759828805923462, + "rewards/rejected": -2.694540023803711, + "step": 326 + }, + { + "epoch": 0.7570201307859982, + "grad_norm": 30.862683948057615, + "learning_rate": 3.909291679154332e-07, + "logits/chosen": 0.5040656328201294, + "logits/rejected": 0.5386430025100708, + "logps/chosen": -42.25190734863281, + "logps/rejected": -62.51930618286133, + "loss": 0.2759, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.2548324167728424, + "rewards/margins": 3.0783848762512207, + "rewards/rejected": -3.333217144012451, + "step": 328 + }, + { + "epoch": 0.7616361071932299, + "grad_norm": 19.125155732205084, + "learning_rate": 3.892590975694858e-07, + "logits/chosen": 0.49563461542129517, + "logits/rejected": 0.539116621017456, + "logps/chosen": -39.31736755371094, + "logps/rejected": -60.45228576660156, + "loss": 0.2182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16796639561653137, + "rewards/margins": 3.4695467948913574, + "rewards/rejected": -3.301579713821411, + "step": 330 + }, + { + "epoch": 0.7616361071932299, + "eval_logits/chosen": 0.4225333333015442, + "eval_logits/rejected": 0.44842836260795593, + "eval_logps/chosen": -41.670494079589844, + "eval_logps/rejected": -53.553314208984375, + "eval_loss": 0.2688952684402466, + "eval_rewards/accuracies": 0.8145161271095276, + "eval_rewards/chosen": 0.037393342703580856, + "eval_rewards/margins": 2.7948849201202393, + "eval_rewards/rejected": -2.7574915885925293, + "eval_runtime": 220.4734, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.968, + "step": 330 + }, + { + "epoch": 0.7662520836004616, + "grad_norm": 20.197390141727503, + "learning_rate": 3.875799676856952e-07, + "logits/chosen": 0.5481100082397461, + "logits/rejected": 0.5680783987045288, + "logps/chosen": -43.26856994628906, + "logps/rejected": -54.90293884277344, + "loss": 0.2148, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.2920362651348114, + "rewards/margins": 2.9112956523895264, + "rewards/rejected": -3.20333194732666, + "step": 332 + }, + { + "epoch": 0.7708680600076933, + "grad_norm": 28.41138671183374, + "learning_rate": 3.858918875003053e-07, + "logits/chosen": 0.5375738143920898, + "logits/rejected": 0.5755133628845215, + "logps/chosen": -41.622859954833984, + "logps/rejected": -61.92311096191406, + "loss": 0.2733, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.09028860926628113, + "rewards/margins": 3.286768674850464, + "rewards/rejected": -3.3770573139190674, + "step": 334 + }, + { + "epoch": 0.775484036414925, + "grad_norm": 16.265551276537238, + "learning_rate": 3.8419496683182396e-07, + "logits/chosen": 0.5556432604789734, + "logits/rejected": 0.5942565202713013, + "logps/chosen": -41.74842071533203, + "logps/rejected": -57.50096893310547, + "loss": 0.1896, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.0623447448015213, + "rewards/margins": 2.878957748413086, + "rewards/rejected": -2.941302537918091, + "step": 336 + }, + { + "epoch": 0.7801000128221567, + "grad_norm": 26.59915287717055, + "learning_rate": 3.824893160738792e-07, + "logits/chosen": 0.5246456861495972, + "logits/rejected": 0.553848385810852, + "logps/chosen": -42.39156723022461, + "logps/rejected": -57.20592498779297, + "loss": 0.2682, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": -0.07438618689775467, + "rewards/margins": 3.046879291534424, + "rewards/rejected": -3.1212656497955322, + "step": 338 + }, + { + "epoch": 0.7847159892293883, + "grad_norm": 23.023616857684974, + "learning_rate": 3.8077504618803737e-07, + "logits/chosen": 0.580450713634491, + "logits/rejected": 0.5835237503051758, + "logps/chosen": -48.9189567565918, + "logps/rejected": -47.836578369140625, + "loss": 0.2668, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.06672815978527069, + "rewards/margins": 2.457933187484741, + "rewards/rejected": -2.5246615409851074, + "step": 340 + }, + { + "epoch": 0.7847159892293883, + "eval_logits/chosen": 0.4240727126598358, + "eval_logits/rejected": 0.4500102698802948, + "eval_logps/chosen": -41.714290618896484, + "eval_logps/rejected": -53.6696662902832, + "eval_loss": 0.2670309841632843, + "eval_rewards/accuracies": 0.8179723620414734, + "eval_rewards/chosen": 0.015493539161980152, + "eval_rewards/margins": 2.83115816116333, + "eval_rewards/rejected": -2.815664768218994, + "eval_runtime": 220.6721, + "eval_samples_per_second": 7.858, + "eval_steps_per_second": 1.967, + "step": 340 + }, + { + "epoch": 0.7893319656366201, + "grad_norm": 16.479244956266236, + "learning_rate": 3.7905226869658446e-07, + "logits/chosen": 0.4684799015522003, + "logits/rejected": 0.4874458909034729, + "logps/chosen": -43.62626647949219, + "logps/rejected": -55.70362854003906, + "loss": 0.2494, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.13190358877182007, + "rewards/margins": 2.8091206550598145, + "rewards/rejected": -2.6772167682647705, + "step": 342 + }, + { + "epoch": 0.7939479420438518, + "grad_norm": 24.369877883114157, + "learning_rate": 3.773210956752709e-07, + "logits/chosen": 0.544243574142456, + "logits/rejected": 0.5578660368919373, + "logps/chosen": -40.1495246887207, + "logps/rejected": -44.17314910888672, + "loss": 0.2798, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.020625757053494453, + "rewards/margins": 2.502214193344116, + "rewards/rejected": -2.481588363647461, + "step": 344 + }, + { + "epoch": 0.7985639184510834, + "grad_norm": 25.623903462647995, + "learning_rate": 3.7558163974602093e-07, + "logits/chosen": 0.474899560213089, + "logits/rejected": 0.5161857008934021, + "logps/chosen": -37.74607467651367, + "logps/rejected": -55.48906707763672, + "loss": 0.2419, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.001830246765166521, + "rewards/margins": 2.923034906387329, + "rewards/rejected": -2.924865245819092, + "step": 346 + }, + { + "epoch": 0.8031798948583152, + "grad_norm": 25.184522607734593, + "learning_rate": 3.73834014069605e-07, + "logits/chosen": 0.558302104473114, + "logits/rejected": 0.5833041667938232, + "logps/chosen": -48.4046630859375, + "logps/rejected": -61.20756149291992, + "loss": 0.2374, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.1346227377653122, + "rewards/margins": 2.8843278884887695, + "rewards/rejected": -3.0189502239227295, + "step": 348 + }, + { + "epoch": 0.8077958712655469, + "grad_norm": 24.77024105098058, + "learning_rate": 3.7207833233827914e-07, + "logits/chosen": 0.4649287462234497, + "logits/rejected": 0.482571542263031, + "logps/chosen": -44.39389419555664, + "logps/rejected": -58.24624252319336, + "loss": 0.2534, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.14530682563781738, + "rewards/margins": 3.2228527069091797, + "rewards/rejected": -3.368159532546997, + "step": 350 + }, + { + "epoch": 0.8077958712655469, + "eval_logits/chosen": 0.42746230959892273, + "eval_logits/rejected": 0.45336535573005676, + "eval_logps/chosen": -42.037269592285156, + "eval_logps/rejected": -54.03358459472656, + "eval_loss": 0.2634715437889099, + "eval_rewards/accuracies": 0.8168202638626099, + "eval_rewards/chosen": -0.1459963023662567, + "eval_rewards/margins": 2.8516335487365723, + "eval_rewards/rejected": -2.9976296424865723, + "eval_runtime": 220.3701, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.969, + "step": 350 + }, + { + "epoch": 0.8124118476727785, + "grad_norm": 26.201135314502036, + "learning_rate": 3.7031470876838786e-07, + "logits/chosen": 0.5293068289756775, + "logits/rejected": 0.5655782222747803, + "logps/chosen": -42.89842224121094, + "logps/rejected": -63.14483642578125, + "loss": 0.2516, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.4706004559993744, + "rewards/margins": 2.8817062377929688, + "rewards/rejected": -3.352307081222534, + "step": 352 + }, + { + "epoch": 0.8170278240800103, + "grad_norm": 22.294887268242963, + "learning_rate": 3.6854325809293455e-07, + "logits/chosen": 0.49771615862846375, + "logits/rejected": 0.5413529276847839, + "logps/chosen": -36.90867233276367, + "logps/rejected": -64.4770278930664, + "loss": 0.2284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27428972721099854, + "rewards/margins": 3.501157522201538, + "rewards/rejected": -3.775447368621826, + "step": 354 + }, + { + "epoch": 0.821643800487242, + "grad_norm": 28.188753078893058, + "learning_rate": 3.6676409555411653e-07, + "logits/chosen": 0.5484297871589661, + "logits/rejected": 0.5813949704170227, + "logps/chosen": -45.460365295410156, + "logps/rejected": -60.86439895629883, + "loss": 0.2542, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4246326684951782, + "rewards/margins": 3.2056918144226074, + "rewards/rejected": -3.630324363708496, + "step": 356 + }, + { + "epoch": 0.8262597768944736, + "grad_norm": 17.14121226520804, + "learning_rate": 3.6497733689582866e-07, + "logits/chosen": 0.48876845836639404, + "logits/rejected": 0.5145962238311768, + "logps/chosen": -39.37761688232422, + "logps/rejected": -49.643211364746094, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21149006485939026, + "rewards/margins": 2.893353223800659, + "rewards/rejected": -3.1048433780670166, + "step": 358 + }, + { + "epoch": 0.8308757533017054, + "grad_norm": 35.17955186267088, + "learning_rate": 3.631830983561335e-07, + "logits/chosen": 0.573662519454956, + "logits/rejected": 0.5948094725608826, + "logps/chosen": -47.85080337524414, + "logps/rejected": -52.225006103515625, + "loss": 0.2586, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.3559052646160126, + "rewards/margins": 2.786222219467163, + "rewards/rejected": -3.142127513885498, + "step": 360 + }, + { + "epoch": 0.8308757533017054, + "eval_logits/chosen": 0.42756161093711853, + "eval_logits/rejected": 0.45349106192588806, + "eval_logps/chosen": -42.38340759277344, + "eval_logps/rejected": -54.44844436645508, + "eval_loss": 0.2630784213542938, + "eval_rewards/accuracies": 0.8179723620414734, + "eval_rewards/chosen": -0.31906506419181824, + "eval_rewards/margins": 2.8859920501708984, + "eval_rewards/rejected": -3.205056667327881, + "eval_runtime": 220.2057, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 360 + }, + { + "epoch": 0.835491729708937, + "grad_norm": 36.03053976982613, + "learning_rate": 3.613814966596991e-07, + "logits/chosen": 0.5263631343841553, + "logits/rejected": 0.5573300123214722, + "logps/chosen": -43.24696731567383, + "logps/rejected": -57.23331069946289, + "loss": 0.2526, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.4683598279953003, + "rewards/margins": 3.082267999649048, + "rewards/rejected": -3.5506277084350586, + "step": 362 + }, + { + "epoch": 0.8401077061161687, + "grad_norm": 15.328563865471402, + "learning_rate": 3.595726490102059e-07, + "logits/chosen": 0.5707637071609497, + "logits/rejected": 0.6143693327903748, + "logps/chosen": -40.44147491455078, + "logps/rejected": -62.61209487915039, + "loss": 0.1294, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.3496915102005005, + "rewards/margins": 3.486618995666504, + "rewards/rejected": -3.836310863494873, + "step": 364 + }, + { + "epoch": 0.8447236825234005, + "grad_norm": 15.002635114989888, + "learning_rate": 3.577566730827214e-07, + "logits/chosen": 0.5126733779907227, + "logits/rejected": 0.5439874529838562, + "logps/chosen": -40.29549789428711, + "logps/rejected": -56.204898834228516, + "loss": 0.2951, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -0.3362084925174713, + "rewards/margins": 2.846021890640259, + "rewards/rejected": -3.182230234146118, + "step": 366 + }, + { + "epoch": 0.8493396589306321, + "grad_norm": 25.52691859216037, + "learning_rate": 3.559336870160453e-07, + "logits/chosen": 0.5128374099731445, + "logits/rejected": 0.5424924492835999, + "logps/chosen": -38.71543884277344, + "logps/rejected": -52.61689758300781, + "loss": 0.2084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28658950328826904, + "rewards/margins": 3.0817792415618896, + "rewards/rejected": -3.368368625640869, + "step": 368 + }, + { + "epoch": 0.8539556353378638, + "grad_norm": 30.283513234320385, + "learning_rate": 3.541038094050241e-07, + "logits/chosen": 0.515430212020874, + "logits/rejected": 0.5466374158859253, + "logps/chosen": -45.59136962890625, + "logps/rejected": -63.18849182128906, + "loss": 0.2378, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.5768634676933289, + "rewards/margins": 3.5630674362182617, + "rewards/rejected": -4.139930725097656, + "step": 370 + }, + { + "epoch": 0.8539556353378638, + "eval_logits/chosen": 0.4274056553840637, + "eval_logits/rejected": 0.45338377356529236, + "eval_logps/chosen": -43.063682556152344, + "eval_logps/rejected": -55.225093841552734, + "eval_loss": 0.2617854177951813, + "eval_rewards/accuracies": 0.817396342754364, + "eval_rewards/chosen": -0.659203290939331, + "eval_rewards/margins": 2.9341788291931152, + "eval_rewards/rejected": -3.5933821201324463, + "eval_runtime": 220.2088, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 370 + }, + { + "epoch": 0.8585716117450956, + "grad_norm": 30.9826241797592, + "learning_rate": 3.52267159292835e-07, + "logits/chosen": 0.4993041455745697, + "logits/rejected": 0.5248599052429199, + "logps/chosen": -44.83211898803711, + "logps/rejected": -61.29323959350586, + "loss": 0.2333, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.7047384977340698, + "rewards/margins": 3.358118772506714, + "rewards/rejected": -4.062856674194336, + "step": 372 + }, + { + "epoch": 0.8631875881523272, + "grad_norm": 16.52463887201103, + "learning_rate": 3.5042385616324236e-07, + "logits/chosen": 0.4287330210208893, + "logits/rejected": 0.46707651019096375, + "logps/chosen": -36.363590240478516, + "logps/rejected": -59.82657241821289, + "loss": 0.22, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8720024228096008, + "rewards/margins": 3.389249086380005, + "rewards/rejected": -4.261251449584961, + "step": 374 + }, + { + "epoch": 0.8678035645595589, + "grad_norm": 15.500715269356169, + "learning_rate": 3.485740199328244e-07, + "logits/chosen": 0.5408291816711426, + "logits/rejected": 0.5578600764274597, + "logps/chosen": -50.285335540771484, + "logps/rejected": -54.07209014892578, + "loss": 0.1876, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.5448592305183411, + "rewards/margins": 3.2346181869506836, + "rewards/rejected": -3.779477119445801, + "step": 376 + }, + { + "epoch": 0.8724195409667906, + "grad_norm": 12.222084345575727, + "learning_rate": 3.4671777094317196e-07, + "logits/chosen": 0.5013281106948853, + "logits/rejected": 0.5262949466705322, + "logps/chosen": -46.47956848144531, + "logps/rejected": -53.49814224243164, + "loss": 0.1677, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.7341945767402649, + "rewards/margins": 3.0543222427368164, + "rewards/rejected": -3.7885169982910156, + "step": 378 + }, + { + "epoch": 0.8770355173740223, + "grad_norm": 22.531696347522484, + "learning_rate": 3.448552299530595e-07, + "logits/chosen": 0.5649933218955994, + "logits/rejected": 0.5860426425933838, + "logps/chosen": -42.52098083496094, + "logps/rejected": -52.308616638183594, + "loss": 0.3071, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -0.9869860410690308, + "rewards/margins": 2.7113142013549805, + "rewards/rejected": -3.698300361633301, + "step": 380 + }, + { + "epoch": 0.8770355173740223, + "eval_logits/chosen": 0.4274827539920807, + "eval_logits/rejected": 0.45349830389022827, + "eval_logps/chosen": -43.129615783691406, + "eval_logps/rejected": -55.33893585205078, + "eval_loss": 0.2627149224281311, + "eval_rewards/accuracies": 0.8156682252883911, + "eval_rewards/chosen": -0.6921693086624146, + "eval_rewards/margins": 2.958131790161133, + "eval_rewards/rejected": -3.650301218032837, + "eval_runtime": 220.3046, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 380 + }, + { + "epoch": 0.881651493781254, + "grad_norm": 39.03269809250303, + "learning_rate": 3.429865181305894e-07, + "logits/chosen": 0.5594089031219482, + "logits/rejected": 0.5762946605682373, + "logps/chosen": -46.85918045043945, + "logps/rejected": -55.68655776977539, + "loss": 0.2915, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.9153691530227661, + "rewards/margins": 2.779404401779175, + "rewards/rejected": -3.694772958755493, + "step": 382 + }, + { + "epoch": 0.8862674701884857, + "grad_norm": 25.617922410092657, + "learning_rate": 3.411117570453091e-07, + "logits/chosen": 0.5484945774078369, + "logits/rejected": 0.5738579034805298, + "logps/chosen": -42.73631286621094, + "logps/rejected": -53.853271484375, + "loss": 0.2369, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.7328565120697021, + "rewards/margins": 2.8266656398773193, + "rewards/rejected": -3.5595223903656006, + "step": 384 + }, + { + "epoch": 0.8908834465957174, + "grad_norm": 30.869961559508535, + "learning_rate": 3.392310686603025e-07, + "logits/chosen": 0.534080982208252, + "logits/rejected": 0.5444844365119934, + "logps/chosen": -42.41215515136719, + "logps/rejected": -50.85294723510742, + "loss": 0.2909, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": -0.9006066918373108, + "rewards/margins": 2.361262559890747, + "rewards/rejected": -3.261868953704834, + "step": 386 + }, + { + "epoch": 0.895499423002949, + "grad_norm": 19.657432685783167, + "learning_rate": 3.3734457532425554e-07, + "logits/chosen": 0.5231594443321228, + "logits/rejected": 0.5530441403388977, + "logps/chosen": -42.48830795288086, + "logps/rejected": -57.00692367553711, + "loss": 0.2606, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.6170899271965027, + "rewards/margins": 3.237041711807251, + "rewards/rejected": -3.8541314601898193, + "step": 388 + }, + { + "epoch": 0.9001153994101808, + "grad_norm": 24.399140672578795, + "learning_rate": 3.354523997634969e-07, + "logits/chosen": 0.540899932384491, + "logits/rejected": 0.5695917010307312, + "logps/chosen": -44.531185150146484, + "logps/rejected": -58.8494873046875, + "loss": 0.2251, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.7790883183479309, + "rewards/margins": 3.128167152404785, + "rewards/rejected": -3.9072554111480713, + "step": 390 + }, + { + "epoch": 0.9001153994101808, + "eval_logits/chosen": 0.42857107520103455, + "eval_logits/rejected": 0.4546278119087219, + "eval_logps/chosen": -43.16852951049805, + "eval_logps/rejected": -55.42344665527344, + "eval_loss": 0.2621525228023529, + "eval_rewards/accuracies": 0.8179723620414734, + "eval_rewards/chosen": -0.7116276621818542, + "eval_rewards/margins": 2.980929374694824, + "eval_rewards/rejected": -3.6925570964813232, + "eval_runtime": 220.3143, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 390 + }, + { + "epoch": 0.9047313758174125, + "grad_norm": 35.01908054863291, + "learning_rate": 3.3355466507401374e-07, + "logits/chosen": 0.5315423607826233, + "logits/rejected": 0.5454668998718262, + "logps/chosen": -42.16218185424805, + "logps/rejected": -44.85585403442383, + "loss": 0.372, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -0.805086612701416, + "rewards/margins": 2.338005542755127, + "rewards/rejected": -3.143092155456543, + "step": 392 + }, + { + "epoch": 0.9093473522246441, + "grad_norm": 21.288998506479572, + "learning_rate": 3.3165149471344394e-07, + "logits/chosen": 0.5552914142608643, + "logits/rejected": 0.5818530321121216, + "logps/chosen": -42.95904541015625, + "logps/rejected": -52.76212692260742, + "loss": 0.2934, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.9580552577972412, + "rewards/margins": 2.6676671504974365, + "rewards/rejected": -3.6257221698760986, + "step": 394 + }, + { + "epoch": 0.9139633286318759, + "grad_norm": 25.556003693396036, + "learning_rate": 3.297430124930444e-07, + "logits/chosen": 0.582655668258667, + "logits/rejected": 0.5952574014663696, + "logps/chosen": -48.771934509277344, + "logps/rejected": -54.426483154296875, + "loss": 0.3223, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -0.6146318912506104, + "rewards/margins": 2.4974234104156494, + "rewards/rejected": -3.112055540084839, + "step": 396 + }, + { + "epoch": 0.9185793050391076, + "grad_norm": 23.905362174336005, + "learning_rate": 3.2782934256963647e-07, + "logits/chosen": 0.5089656114578247, + "logits/rejected": 0.5398065447807312, + "logps/chosen": -45.75530242919922, + "logps/rejected": -61.64253234863281, + "loss": 0.2549, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.6105983853340149, + "rewards/margins": 3.1589841842651367, + "rewards/rejected": -3.769582509994507, + "step": 398 + }, + { + "epoch": 0.9231952814463392, + "grad_norm": 24.17532494020093, + "learning_rate": 3.259106094375289e-07, + "logits/chosen": 0.539167046546936, + "logits/rejected": 0.5812445282936096, + "logps/chosen": -39.31736755371094, + "logps/rejected": -63.33793640136719, + "loss": 0.2698, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.3948301374912262, + "rewards/margins": 3.442387819290161, + "rewards/rejected": -3.8372182846069336, + "step": 400 + }, + { + "epoch": 0.9231952814463392, + "eval_logits/chosen": 0.42656469345092773, + "eval_logits/rejected": 0.45276370644569397, + "eval_logps/chosen": -42.66855239868164, + "eval_logps/rejected": -55.0075798034668, + "eval_loss": 0.2560158371925354, + "eval_rewards/accuracies": 0.8231566548347473, + "eval_rewards/chosen": -0.46163854002952576, + "eval_rewards/margins": 3.0229856967926025, + "eval_rewards/rejected": -3.4846243858337402, + "eval_runtime": 220.2216, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 400 + }, + { + "epoch": 0.927811257853571, + "grad_norm": 30.671620714098214, + "learning_rate": 3.239869379204189e-07, + "logits/chosen": 0.4974105656147003, + "logits/rejected": 0.5221477746963501, + "logps/chosen": -45.057281494140625, + "logps/rejected": -56.83816909790039, + "loss": 0.2017, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.5868238210678101, + "rewards/margins": 3.3964414596557617, + "rewards/rejected": -3.9832653999328613, + "step": 402 + }, + { + "epoch": 0.9324272342608027, + "grad_norm": 24.915176146115876, + "learning_rate": 3.2205845316327144e-07, + "logits/chosen": 0.5429517030715942, + "logits/rejected": 0.5683455467224121, + "logps/chosen": -34.97327423095703, + "logps/rejected": -46.666717529296875, + "loss": 0.3399, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -0.43591320514678955, + "rewards/margins": 2.185106039047241, + "rewards/rejected": -2.6210196018218994, + "step": 404 + }, + { + "epoch": 0.9370432106680343, + "grad_norm": 23.867375292949593, + "learning_rate": 3.2012528062417845e-07, + "logits/chosen": 0.5323294997215271, + "logits/rejected": 0.5459015369415283, + "logps/chosen": -43.10551071166992, + "logps/rejected": -47.71934127807617, + "loss": 0.2436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7240028977394104, + "rewards/margins": 2.4708030223846436, + "rewards/rejected": -3.1948060989379883, + "step": 406 + }, + { + "epoch": 0.9416591870752661, + "grad_norm": 15.007721932706033, + "learning_rate": 3.1818754606619643e-07, + "logits/chosen": 0.5331852436065674, + "logits/rejected": 0.564946174621582, + "logps/chosen": -36.540283203125, + "logps/rejected": -57.03317642211914, + "loss": 0.2822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16474466025829315, + "rewards/margins": 3.167923927307129, + "rewards/rejected": -3.3326683044433594, + "step": 408 + }, + { + "epoch": 0.9462751634824977, + "grad_norm": 22.364487052769828, + "learning_rate": 3.162453755491655e-07, + "logits/chosen": 0.49684393405914307, + "logits/rejected": 0.5316374897956848, + "logps/chosen": -38.39241027832031, + "logps/rejected": -59.15244674682617, + "loss": 0.1874, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.322665810585022, + "rewards/margins": 3.4969892501831055, + "rewards/rejected": -3.819655179977417, + "step": 410 + }, + { + "epoch": 0.9462751634824977, + "eval_logits/chosen": 0.4290708899497986, + "eval_logits/rejected": 0.45515918731689453, + "eval_logps/chosen": -42.679603576660156, + "eval_logps/rejected": -55.10276412963867, + "eval_loss": 0.2565246820449829, + "eval_rewards/accuracies": 0.8191244006156921, + "eval_rewards/chosen": -0.467162162065506, + "eval_rewards/margins": 3.065053939819336, + "eval_rewards/rejected": -3.5322158336639404, + "eval_runtime": 220.2891, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 410 + }, + { + "epoch": 0.9508911398897294, + "grad_norm": 25.19862106785063, + "learning_rate": 3.142988954215079e-07, + "logits/chosen": 0.5264102816581726, + "logits/rejected": 0.5622512698173523, + "logps/chosen": -43.48373794555664, + "logps/rejected": -66.42120361328125, + "loss": 0.2996, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.48827776312828064, + "rewards/margins": 3.3450686931610107, + "rewards/rejected": -3.833346128463745, + "step": 412 + }, + { + "epoch": 0.9555071162969612, + "grad_norm": 21.74301345510537, + "learning_rate": 3.1234823231200925e-07, + "logits/chosen": 0.5031583309173584, + "logits/rejected": 0.5540390014648438, + "logps/chosen": -40.93600845336914, + "logps/rejected": -66.30878448486328, + "loss": 0.2428, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.5792509317398071, + "rewards/margins": 3.6368870735168457, + "rewards/rejected": -4.2161383628845215, + "step": 414 + }, + { + "epoch": 0.9601230927041928, + "grad_norm": 22.436508219334904, + "learning_rate": 3.1039351312157993e-07, + "logits/chosen": 0.56053227186203, + "logits/rejected": 0.590539813041687, + "logps/chosen": -41.67660140991211, + "logps/rejected": -58.28109359741211, + "loss": 0.2048, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.6333367228507996, + "rewards/margins": 3.312451124191284, + "rewards/rejected": -3.9457881450653076, + "step": 416 + }, + { + "epoch": 0.9647390691114246, + "grad_norm": 36.50210265432233, + "learning_rate": 3.0843486501499967e-07, + "logits/chosen": 0.508413553237915, + "logits/rejected": 0.5429882407188416, + "logps/chosen": -42.58755111694336, + "logps/rejected": -52.10399627685547, + "loss": 0.3069, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.4402269721031189, + "rewards/margins": 2.6428239345550537, + "rewards/rejected": -3.0830507278442383, + "step": 418 + }, + { + "epoch": 0.9693550455186563, + "grad_norm": 19.432988353108243, + "learning_rate": 3.064724154126449e-07, + "logits/chosen": 0.48101869225502014, + "logits/rejected": 0.49470260739326477, + "logps/chosen": -43.99076461791992, + "logps/rejected": -47.8154411315918, + "loss": 0.2486, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.6770768761634827, + "rewards/margins": 2.6182446479797363, + "rewards/rejected": -3.2953217029571533, + "step": 420 + }, + { + "epoch": 0.9693550455186563, + "eval_logits/chosen": 0.4298844337463379, + "eval_logits/rejected": 0.45596131682395935, + "eval_logps/chosen": -42.74457550048828, + "eval_logps/rejected": -55.1827278137207, + "eval_loss": 0.2540464699268341, + "eval_rewards/accuracies": 0.8231566548347473, + "eval_rewards/chosen": -0.4996483027935028, + "eval_rewards/margins": 3.072551727294922, + "eval_rewards/rejected": -3.572199821472168, + "eval_runtime": 220.4655, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.969, + "step": 420 + }, + { + "epoch": 0.9739710219258879, + "grad_norm": 21.396529357952137, + "learning_rate": 3.045062919821995e-07, + "logits/chosen": 0.5096142292022705, + "logits/rejected": 0.5509178638458252, + "logps/chosen": -40.65134811401367, + "logps/rejected": -64.13406372070312, + "loss": 0.2407, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.25429394841194153, + "rewards/margins": 3.5406899452209473, + "rewards/rejected": -3.7949838638305664, + "step": 422 + }, + { + "epoch": 0.9785869983331197, + "grad_norm": 27.30197314549755, + "learning_rate": 3.0253662263034925e-07, + "logits/chosen": 0.5253940224647522, + "logits/rejected": 0.5617537498474121, + "logps/chosen": -44.63224792480469, + "logps/rejected": -62.29665756225586, + "loss": 0.2666, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.6128353476524353, + "rewards/margins": 3.4666247367858887, + "rewards/rejected": -4.079460144042969, + "step": 424 + }, + { + "epoch": 0.9832029747403513, + "grad_norm": 40.51282949087652, + "learning_rate": 3.005635354944606e-07, + "logits/chosen": 0.5502428412437439, + "logits/rejected": 0.5616468787193298, + "logps/chosen": -46.97676467895508, + "logps/rejected": -46.36595153808594, + "loss": 0.2894, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.7273317575454712, + "rewards/margins": 2.6478114128112793, + "rewards/rejected": -3.375143051147461, + "step": 426 + }, + { + "epoch": 0.987818951147583, + "grad_norm": 23.92512657865844, + "learning_rate": 2.9858715893424504e-07, + "logits/chosen": 0.5228149890899658, + "logits/rejected": 0.5698718428611755, + "logps/chosen": -40.91889953613281, + "logps/rejected": -64.06893920898438, + "loss": 0.1794, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.32393407821655273, + "rewards/margins": 3.8048884868621826, + "rewards/rejected": -4.128821849822998, + "step": 428 + }, + { + "epoch": 0.9924349275548148, + "grad_norm": 18.33017798245734, + "learning_rate": 2.966076215234082e-07, + "logits/chosen": 0.5833015441894531, + "logits/rejected": 0.6151509881019592, + "logps/chosen": -47.47243118286133, + "logps/rejected": -64.26097869873047, + "loss": 0.2098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2687421441078186, + "rewards/margins": 3.582411766052246, + "rewards/rejected": -3.85115385055542, + "step": 430 + }, + { + "epoch": 0.9924349275548148, + "eval_logits/chosen": 0.42911431193351746, + "eval_logits/rejected": 0.45535048842430115, + "eval_logps/chosen": -42.6432991027832, + "eval_logps/rejected": -55.0967903137207, + "eval_loss": 0.25298023223876953, + "eval_rewards/accuracies": 0.8237327337265015, + "eval_rewards/chosen": -0.4490084946155548, + "eval_rewards/margins": 3.0802206993103027, + "eval_rewards/rejected": -3.5292294025421143, + "eval_runtime": 220.5016, + "eval_samples_per_second": 7.864, + "eval_steps_per_second": 1.968, + "step": 430 + }, + { + "epoch": 0.9970509039620464, + "grad_norm": 24.845062608395242, + "learning_rate": 2.94625052041286e-07, + "logits/chosen": 0.529398500919342, + "logits/rejected": 0.5461426377296448, + "logps/chosen": -42.26673889160156, + "logps/rejected": -52.43321228027344, + "loss": 0.2582, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.38506922125816345, + "rewards/margins": 2.947833299636841, + "rewards/rejected": -3.332902431488037, + "step": 432 + }, + { + "epoch": 1.001666880369278, + "grad_norm": 14.705625802608846, + "learning_rate": 2.926395794644665e-07, + "logits/chosen": 0.5060461759567261, + "logits/rejected": 0.5222041010856628, + "logps/chosen": -45.8979606628418, + "logps/rejected": -55.48097229003906, + "loss": 0.1798, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.3213649392127991, + "rewards/margins": 3.302720308303833, + "rewards/rejected": -3.6240854263305664, + "step": 434 + }, + { + "epoch": 1.0062828567765099, + "grad_norm": 24.90302953634143, + "learning_rate": 2.906513329583991e-07, + "logits/chosen": 0.5120677351951599, + "logits/rejected": 0.5406749844551086, + "logps/chosen": -40.07225036621094, + "logps/rejected": -54.882259368896484, + "loss": 0.2186, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.5253066420555115, + "rewards/margins": 3.1281352043151855, + "rewards/rejected": -3.653441905975342, + "step": 436 + }, + { + "epoch": 1.0108988331837414, + "grad_norm": 20.006366802619794, + "learning_rate": 2.886604418689921e-07, + "logits/chosen": 0.48885577917099, + "logits/rejected": 0.5327137112617493, + "logps/chosen": -38.752708435058594, + "logps/rejected": -66.8874740600586, + "loss": 0.2705, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -0.5506837368011475, + "rewards/margins": 3.6388425827026367, + "rewards/rejected": -4.189526557922363, + "step": 438 + }, + { + "epoch": 1.0155148095909732, + "grad_norm": 11.538422039384988, + "learning_rate": 2.866670357141979e-07, + "logits/chosen": 0.5471632480621338, + "logits/rejected": 0.5706813931465149, + "logps/chosen": -44.1706428527832, + "logps/rejected": -54.80915832519531, + "loss": 0.2123, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.5128348469734192, + "rewards/margins": 3.5640437602996826, + "rewards/rejected": -4.076879024505615, + "step": 440 + }, + { + "epoch": 1.0155148095909732, + "eval_logits/chosen": 0.42714568972587585, + "eval_logits/rejected": 0.4533489942550659, + "eval_logps/chosen": -42.395565032958984, + "eval_logps/rejected": -54.934104919433594, + "eval_loss": 0.2539977729320526, + "eval_rewards/accuracies": 0.8231566548347473, + "eval_rewards/chosen": -0.3251444697380066, + "eval_rewards/margins": 3.122741937637329, + "eval_rewards/rejected": -3.4478864669799805, + "eval_runtime": 220.3559, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.97, + "step": 440 + }, + { + "epoch": 1.020130785998205, + "grad_norm": 16.119320288131345, + "learning_rate": 2.8467124417558737e-07, + "logits/chosen": 0.5559278130531311, + "logits/rejected": 0.5782606601715088, + "logps/chosen": -43.08287048339844, + "logps/rejected": -55.4886474609375, + "loss": 0.2118, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.22590351104736328, + "rewards/margins": 3.3553009033203125, + "rewards/rejected": -3.581204414367676, + "step": 442 + }, + { + "epoch": 1.0247467624054365, + "grad_norm": 21.10014479926061, + "learning_rate": 2.8267319708991253e-07, + "logits/chosen": 0.5570061206817627, + "logits/rejected": 0.5741885304450989, + "logps/chosen": -46.57928466796875, + "logps/rejected": -48.77629089355469, + "loss": 0.2203, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.2255779653787613, + "rewards/margins": 2.8583762645721436, + "rewards/rejected": -3.083954334259033, + "step": 444 + }, + { + "epoch": 1.0293627388126683, + "grad_norm": 21.99323071947427, + "learning_rate": 2.806730244406612e-07, + "logits/chosen": 0.5444987416267395, + "logits/rejected": 0.5731097459793091, + "logps/chosen": -40.73080825805664, + "logps/rejected": -52.80342102050781, + "loss": 0.2407, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.2986847758293152, + "rewards/margins": 3.0820257663726807, + "rewards/rejected": -3.3807103633880615, + "step": 446 + }, + { + "epoch": 1.0339787152199, + "grad_norm": 17.17450683483707, + "learning_rate": 2.786708563496001e-07, + "logits/chosen": 0.5541989207267761, + "logits/rejected": 0.5817456841468811, + "logps/chosen": -45.73213195800781, + "logps/rejected": -61.18666458129883, + "loss": 0.1772, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.05669987201690674, + "rewards/margins": 3.8165981769561768, + "rewards/rejected": -3.873298168182373, + "step": 448 + }, + { + "epoch": 1.0385946916271316, + "grad_norm": 27.653708636239905, + "learning_rate": 2.7666682306830994e-07, + "logits/chosen": 0.5207394957542419, + "logits/rejected": 0.5322983860969543, + "logps/chosen": -41.09166717529297, + "logps/rejected": -43.31468200683594, + "loss": 0.2544, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.3381701707839966, + "rewards/margins": 2.6456761360168457, + "rewards/rejected": -2.9838459491729736, + "step": 450 + }, + { + "epoch": 1.0385946916271316, + "eval_logits/chosen": 0.43128177523612976, + "eval_logits/rejected": 0.4573296308517456, + "eval_logps/chosen": -42.16498565673828, + "eval_logps/rejected": -54.75392150878906, + "eval_loss": 0.2521970570087433, + "eval_rewards/accuracies": 0.8248847723007202, + "eval_rewards/chosen": -0.20985357463359833, + "eval_rewards/margins": 3.147939920425415, + "eval_rewards/rejected": -3.3577938079833984, + "eval_runtime": 220.2887, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 450 + }, + { + "epoch": 1.0432106680343634, + "grad_norm": 26.863807248353726, + "learning_rate": 2.746610549697119e-07, + "logits/chosen": 0.5497666001319885, + "logits/rejected": 0.5746829509735107, + "logps/chosen": -42.95619583129883, + "logps/rejected": -57.17405700683594, + "loss": 0.2279, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.33137860894203186, + "rewards/margins": 3.0671894550323486, + "rewards/rejected": -3.3985676765441895, + "step": 452 + }, + { + "epoch": 1.0478266444415951, + "grad_norm": 15.765922708965844, + "learning_rate": 2.7265368253958615e-07, + "logits/chosen": 0.5027904510498047, + "logits/rejected": 0.5187773108482361, + "logps/chosen": -40.01198959350586, + "logps/rejected": -49.16390609741211, + "loss": 0.1826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024355987086892128, + "rewards/margins": 3.001004219055176, + "rewards/rejected": -3.025360107421875, + "step": 454 + }, + { + "epoch": 1.0524426208488267, + "grad_norm": 13.117750938407347, + "learning_rate": 2.706448363680831e-07, + "logits/chosen": 0.5505272746086121, + "logits/rejected": 0.592627763748169, + "logps/chosen": -40.86323928833008, + "logps/rejected": -65.0215072631836, + "loss": 0.1182, + "rewards/accuracies": 0.9722222089767456, + "rewards/chosen": -0.19750367105007172, + "rewards/margins": 4.092833995819092, + "rewards/rejected": -4.290337562561035, + "step": 456 + }, + { + "epoch": 1.0570585972560584, + "grad_norm": 16.896591758231867, + "learning_rate": 2.686346471412277e-07, + "logits/chosen": 0.4872972071170807, + "logits/rejected": 0.5277370810508728, + "logps/chosen": -44.69199752807617, + "logps/rejected": -65.82919311523438, + "loss": 0.1481, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.253704696893692, + "rewards/margins": 3.8575947284698486, + "rewards/rejected": -4.111299514770508, + "step": 458 + }, + { + "epoch": 1.0616745736632902, + "grad_norm": 20.974972760985903, + "learning_rate": 2.6662324563241805e-07, + "logits/chosen": 0.5082690119743347, + "logits/rejected": 0.5304160118103027, + "logps/chosen": -39.70173263549805, + "logps/rejected": -50.749732971191406, + "loss": 0.218, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.058096084743738174, + "rewards/margins": 2.925325632095337, + "rewards/rejected": -2.983421802520752, + "step": 460 + }, + { + "epoch": 1.0616745736632902, + "eval_logits/chosen": 0.42715081572532654, + "eval_logits/rejected": 0.45357510447502136, + "eval_logps/chosen": -41.917137145996094, + "eval_logps/rejected": -54.64493179321289, + "eval_loss": 0.2522634267807007, + "eval_rewards/accuracies": 0.8231566548347473, + "eval_rewards/chosen": -0.08592969179153442, + "eval_rewards/margins": 3.217369556427002, + "eval_rewards/rejected": -3.3032991886138916, + "eval_runtime": 220.2922, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 460 + }, + { + "epoch": 1.0662905500705218, + "grad_norm": 14.344965515087893, + "learning_rate": 2.6461076269391713e-07, + "logits/chosen": 0.5723965167999268, + "logits/rejected": 0.6080074310302734, + "logps/chosen": -47.22536087036133, + "logps/rejected": -63.04933166503906, + "loss": 0.1633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08401741087436676, + "rewards/margins": 4.024357318878174, + "rewards/rejected": -4.10837459564209, + "step": 462 + }, + { + "epoch": 1.0709065264777535, + "grad_norm": 22.161377940303407, + "learning_rate": 2.625973292483409e-07, + "logits/chosen": 0.49575677514076233, + "logits/rejected": 0.5175695419311523, + "logps/chosen": -49.86793518066406, + "logps/rejected": -61.0032958984375, + "loss": 0.2086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2437991052865982, + "rewards/margins": 3.3475723266601562, + "rewards/rejected": -3.5913712978363037, + "step": 464 + }, + { + "epoch": 1.0755225028849853, + "grad_norm": 9.157546830395537, + "learning_rate": 2.6058307628014065e-07, + "logits/chosen": 0.5648156404495239, + "logits/rejected": 0.5903113484382629, + "logps/chosen": -47.16014099121094, + "logps/rejected": -58.00987243652344, + "loss": 0.1681, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.20527897775173187, + "rewards/margins": 3.885181427001953, + "rewards/rejected": -4.090460777282715, + "step": 466 + }, + { + "epoch": 1.0801384792922168, + "grad_norm": 20.418800394750264, + "learning_rate": 2.5856813482708217e-07, + "logits/chosen": 0.5167273879051208, + "logits/rejected": 0.5341954827308655, + "logps/chosen": -44.03962707519531, + "logps/rejected": -48.64061737060547, + "loss": 0.205, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.06323742121458054, + "rewards/margins": 3.104510545730591, + "rewards/rejected": -3.041273355484009, + "step": 468 + }, + { + "epoch": 1.0847544556994486, + "grad_norm": 24.70628607742756, + "learning_rate": 2.565526359717206e-07, + "logits/chosen": 0.537581205368042, + "logits/rejected": 0.5596475005149841, + "logps/chosen": -37.46675109863281, + "logps/rejected": -45.9968147277832, + "loss": 0.3005, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.24194829165935516, + "rewards/margins": 2.6193909645080566, + "rewards/rejected": -2.8613390922546387, + "step": 470 + }, + { + "epoch": 1.0847544556994486, + "eval_logits/chosen": 0.4362466037273407, + "eval_logits/rejected": 0.4623866379261017, + "eval_logps/chosen": -42.15773010253906, + "eval_logps/rejected": -54.935401916503906, + "eval_loss": 0.24963192641735077, + "eval_rewards/accuracies": 0.8260368704795837, + "eval_rewards/chosen": -0.20622780919075012, + "eval_rewards/margins": 3.242306709289551, + "eval_rewards/rejected": -3.4485342502593994, + "eval_runtime": 220.4037, + "eval_samples_per_second": 7.867, + "eval_steps_per_second": 1.969, + "step": 470 + }, + { + "epoch": 1.0893704321066804, + "grad_norm": 27.430779359112005, + "learning_rate": 2.545367108328731e-07, + "logits/chosen": 0.5652859807014465, + "logits/rejected": 0.591205358505249, + "logps/chosen": -43.71979904174805, + "logps/rejected": -53.00830841064453, + "loss": 0.2156, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.20083469152450562, + "rewards/margins": 3.2087488174438477, + "rewards/rejected": -3.409583330154419, + "step": 472 + }, + { + "epoch": 1.0939864085139122, + "grad_norm": 13.134510140867176, + "learning_rate": 2.525204905570889e-07, + "logits/chosen": 0.5791910290718079, + "logits/rejected": 0.6038353443145752, + "logps/chosen": -46.998390197753906, + "logps/rejected": -59.18220520019531, + "loss": 0.1707, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.05355483293533325, + "rewards/margins": 3.5535666942596436, + "rewards/rejected": -3.607121706008911, + "step": 474 + }, + { + "epoch": 1.0986023849211437, + "grad_norm": 19.90392742325827, + "learning_rate": 2.505041063101171e-07, + "logits/chosen": 0.5816848278045654, + "logits/rejected": 0.6008831858634949, + "logps/chosen": -47.19880676269531, + "logps/rejected": -51.822105407714844, + "loss": 0.2218, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03883904218673706, + "rewards/margins": 3.348583221435547, + "rewards/rejected": -3.309744358062744, + "step": 476 + }, + { + "epoch": 1.1032183613283755, + "grad_norm": 17.00116980477646, + "learning_rate": 2.4848768926837466e-07, + "logits/chosen": 0.5338962078094482, + "logits/rejected": 0.5906614065170288, + "logps/chosen": -40.04157257080078, + "logps/rejected": -76.84749603271484, + "loss": 0.1893, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1601162701845169, + "rewards/margins": 4.218037128448486, + "rewards/rejected": -4.378152847290039, + "step": 478 + }, + { + "epoch": 1.107834337735607, + "grad_norm": 15.038557815597683, + "learning_rate": 2.464713706104113e-07, + "logits/chosen": 0.5352125763893127, + "logits/rejected": 0.5612537264823914, + "logps/chosen": -43.91660690307617, + "logps/rejected": -56.44979476928711, + "loss": 0.1633, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.2793487310409546, + "rewards/margins": 3.6175765991210938, + "rewards/rejected": -3.896925210952759, + "step": 480 + }, + { + "epoch": 1.107834337735607, + "eval_logits/chosen": 0.43004509806632996, + "eval_logits/rejected": 0.4563468098640442, + "eval_logps/chosen": -42.171958923339844, + "eval_logps/rejected": -54.986507415771484, + "eval_loss": 0.24832715094089508, + "eval_rewards/accuracies": 0.8271889686584473, + "eval_rewards/chosen": -0.21334028244018555, + "eval_rewards/margins": 3.2607483863830566, + "eval_rewards/rejected": -3.474088668823242, + "eval_runtime": 220.2251, + "eval_samples_per_second": 7.874, + "eval_steps_per_second": 1.971, + "step": 480 + }, + { + "epoch": 1.1124503141428388, + "grad_norm": 22.9744657106464, + "learning_rate": 2.444552815083767e-07, + "logits/chosen": 0.6254298686981201, + "logits/rejected": 0.6373676061630249, + "logps/chosen": -42.673282623291016, + "logps/rejected": -45.563087463378906, + "loss": 0.2114, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.057508740574121475, + "rewards/margins": 3.0235791206359863, + "rewards/rejected": -3.081087350845337, + "step": 482 + }, + { + "epoch": 1.1170662905500706, + "grad_norm": 17.674691508042564, + "learning_rate": 2.4243955311948693e-07, + "logits/chosen": 0.5245480537414551, + "logits/rejected": 0.5648095011711121, + "logps/chosen": -39.3298225402832, + "logps/rejected": -61.31127166748047, + "loss": 0.2236, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.1908557116985321, + "rewards/margins": 3.677870512008667, + "rewards/rejected": -3.8687260150909424, + "step": 484 + }, + { + "epoch": 1.1216822669573023, + "grad_norm": 19.4717194397301, + "learning_rate": 2.4042431657749115e-07, + "logits/chosen": 0.585620105266571, + "logits/rejected": 0.6345695853233337, + "logps/chosen": -41.645267486572266, + "logps/rejected": -72.78955078125, + "loss": 0.1703, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.20467931032180786, + "rewards/margins": 4.08174991607666, + "rewards/rejected": -4.286429405212402, + "step": 486 + }, + { + "epoch": 1.1262982433645339, + "grad_norm": 30.909727917565508, + "learning_rate": 2.384097029841419e-07, + "logits/chosen": 0.4901224672794342, + "logits/rejected": 0.5071887969970703, + "logps/chosen": -43.30605697631836, + "logps/rejected": -50.992618560791016, + "loss": 0.2185, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.18728405237197876, + "rewards/margins": 2.9479784965515137, + "rewards/rejected": -3.1352624893188477, + "step": 488 + }, + { + "epoch": 1.1309142197717656, + "grad_norm": 16.93415094151409, + "learning_rate": 2.3639584340066544e-07, + "logits/chosen": 0.5211553573608398, + "logits/rejected": 0.5518543124198914, + "logps/chosen": -37.83938980102539, + "logps/rejected": -53.91053009033203, + "loss": 0.234, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.05988183990120888, + "rewards/margins": 3.5345206260681152, + "rewards/rejected": -3.4746387004852295, + "step": 490 + }, + { + "epoch": 1.1309142197717656, + "eval_logits/chosen": 0.43326738476753235, + "eval_logits/rejected": 0.45958051085472107, + "eval_logps/chosen": -41.84520721435547, + "eval_logps/rejected": -54.6281852722168, + "eval_loss": 0.24792973697185516, + "eval_rewards/accuracies": 0.8220046162605286, + "eval_rewards/chosen": -0.04996471852064133, + "eval_rewards/margins": 3.244964361190796, + "eval_rewards/rejected": -3.294929265975952, + "eval_runtime": 220.3046, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 490 + }, + { + "epoch": 1.1355301961789972, + "grad_norm": 16.790260075155444, + "learning_rate": 2.3438286883923539e-07, + "logits/chosen": 0.5881079435348511, + "logits/rejected": 0.6105315685272217, + "logps/chosen": -46.794837951660156, + "logps/rejected": -53.43986511230469, + "loss": 0.2269, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.08306831121444702, + "rewards/margins": 3.1719002723693848, + "rewards/rejected": -3.088831663131714, + "step": 492 + }, + { + "epoch": 1.140146172586229, + "grad_norm": 22.957641710400285, + "learning_rate": 2.323709102544506e-07, + "logits/chosen": 0.6002509593963623, + "logits/rejected": 0.6072889566421509, + "logps/chosen": -39.66600036621094, + "logps/rejected": -41.07653045654297, + "loss": 0.2857, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": 0.20397840440273285, + "rewards/margins": 2.4769766330718994, + "rewards/rejected": -2.272998094558716, + "step": 494 + }, + { + "epoch": 1.1447621489934607, + "grad_norm": 27.504424003065566, + "learning_rate": 2.3036009853481474e-07, + "logits/chosen": 0.5301830768585205, + "logits/rejected": 0.5608452558517456, + "logps/chosen": -39.39542770385742, + "logps/rejected": -58.36659622192383, + "loss": 0.2681, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.2189822793006897, + "rewards/margins": 3.4378933906555176, + "rewards/rejected": -3.6568756103515625, + "step": 496 + }, + { + "epoch": 1.1493781254006925, + "grad_norm": 16.835368907101664, + "learning_rate": 2.283505644942223e-07, + "logits/chosen": 0.5190525054931641, + "logits/rejected": 0.5493537783622742, + "logps/chosen": -34.43808364868164, + "logps/rejected": -54.84063720703125, + "loss": 0.1937, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.13352231681346893, + "rewards/margins": 3.440141201019287, + "rewards/rejected": -3.3066186904907227, + "step": 498 + }, + { + "epoch": 1.153994101807924, + "grad_norm": 14.320814422051418, + "learning_rate": 2.2634243886344781e-07, + "logits/chosen": 0.5132643580436707, + "logits/rejected": 0.5381724834442139, + "logps/chosen": -41.94618225097656, + "logps/rejected": -54.74879455566406, + "loss": 0.243, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.1846380978822708, + "rewards/margins": 3.523959159851074, + "rewards/rejected": -3.3393211364746094, + "step": 500 + }, + { + "epoch": 1.153994101807924, + "eval_logits/chosen": 0.43241602182388306, + "eval_logits/rejected": 0.45862025022506714, + "eval_logps/chosen": -41.512245178222656, + "eval_logps/rejected": -54.365325927734375, + "eval_loss": 0.24479356408119202, + "eval_rewards/accuracies": 0.8289170265197754, + "eval_rewards/chosen": 0.11651827394962311, + "eval_rewards/margins": 3.2800135612487793, + "eval_rewards/rejected": -3.1634950637817383, + "eval_runtime": 220.3257, + "eval_samples_per_second": 7.87, + "eval_steps_per_second": 1.97, + "step": 500 + }, + { + "epoch": 1.1586100782151558, + "grad_norm": 17.24901468893502, + "learning_rate": 2.2433585228164115e-07, + "logits/chosen": 0.5386977791786194, + "logits/rejected": 0.5774834156036377, + "logps/chosen": -43.753910064697266, + "logps/rejected": -65.60494232177734, + "loss": 0.1918, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.19071653485298157, + "rewards/margins": 4.159061908721924, + "rewards/rejected": -3.9683446884155273, + "step": 502 + }, + { + "epoch": 1.1632260546223874, + "grad_norm": 22.994462305856853, + "learning_rate": 2.2233093528782938e-07, + "logits/chosen": 0.5429908037185669, + "logits/rejected": 0.5663915872573853, + "logps/chosen": -49.295047760009766, + "logps/rejected": -58.83778381347656, + "loss": 0.1741, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.27108439803123474, + "rewards/margins": 3.4974775314331055, + "rewards/rejected": -3.226392984390259, + "step": 504 + }, + { + "epoch": 1.1678420310296191, + "grad_norm": 19.749474882703815, + "learning_rate": 2.2032781831242367e-07, + "logits/chosen": 0.5360143184661865, + "logits/rejected": 0.5641200542449951, + "logps/chosen": -35.82609558105469, + "logps/rejected": -44.779361724853516, + "loss": 0.2253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4115668535232544, + "rewards/margins": 2.9376118183135986, + "rewards/rejected": -2.526045083999634, + "step": 506 + }, + { + "epoch": 1.172458007436851, + "grad_norm": 29.881557534524536, + "learning_rate": 2.183266316687347e-07, + "logits/chosen": 0.5799429416656494, + "logits/rejected": 0.5963388681411743, + "logps/chosen": -42.11252975463867, + "logps/rejected": -44.56486511230469, + "loss": 0.2905, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": 0.2770005762577057, + "rewards/margins": 2.54060435295105, + "rewards/rejected": -2.263603687286377, + "step": 508 + }, + { + "epoch": 1.1770739838440827, + "grad_norm": 11.72889590765659, + "learning_rate": 2.16327505544495e-07, + "logits/chosen": 0.5231108069419861, + "logits/rejected": 0.5499060153961182, + "logps/chosen": -43.436798095703125, + "logps/rejected": -57.92034912109375, + "loss": 0.1472, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.47280406951904297, + "rewards/margins": 4.098244667053223, + "rewards/rejected": -3.625440835952759, + "step": 510 + }, + { + "epoch": 1.1770739838440827, + "eval_logits/chosen": 0.43323588371276855, + "eval_logits/rejected": 0.4594508111476898, + "eval_logps/chosen": -41.14154815673828, + "eval_logps/rejected": -54.075172424316406, + "eval_loss": 0.247583270072937, + "eval_rewards/accuracies": 0.828341007232666, + "eval_rewards/chosen": 0.30186572670936584, + "eval_rewards/margins": 3.3202853202819824, + "eval_rewards/rejected": -3.0184197425842285, + "eval_runtime": 220.3645, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.969, + "step": 510 + }, + { + "epoch": 1.1816899602513142, + "grad_norm": 19.02915371887465, + "learning_rate": 2.143305699933892e-07, + "logits/chosen": 0.5309435725212097, + "logits/rejected": 0.5609121322631836, + "logps/chosen": -39.10821533203125, + "logps/rejected": -55.85133743286133, + "loss": 0.2148, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.35923802852630615, + "rewards/margins": 3.6412789821624756, + "rewards/rejected": -3.282041549682617, + "step": 512 + }, + { + "epoch": 1.186305936658546, + "grad_norm": 18.184730820886717, + "learning_rate": 2.1233595492659382e-07, + "logits/chosen": 0.6312618851661682, + "logits/rejected": 0.6453579068183899, + "logps/chosen": -48.93413543701172, + "logps/rejected": -50.58020782470703, + "loss": 0.1701, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.28959882259368896, + "rewards/margins": 3.4992854595184326, + "rewards/rejected": -3.209686040878296, + "step": 514 + }, + { + "epoch": 1.1909219130657775, + "grad_norm": 21.115621604290848, + "learning_rate": 2.1034379010432542e-07, + "logits/chosen": 0.5738712549209595, + "logits/rejected": 0.5990296006202698, + "logps/chosen": -36.4149055480957, + "logps/rejected": -47.95274353027344, + "loss": 0.2192, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.35762646794319153, + "rewards/margins": 3.1450395584106445, + "rewards/rejected": -2.7874133586883545, + "step": 516 + }, + { + "epoch": 1.1955378894730093, + "grad_norm": 18.313049973835163, + "learning_rate": 2.0835420512739957e-07, + "logits/chosen": 0.48849010467529297, + "logits/rejected": 0.5418619513511658, + "logps/chosen": -39.52627182006836, + "logps/rejected": -70.53701782226562, + "loss": 0.1678, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.39579084515571594, + "rewards/margins": 4.528857231140137, + "rewards/rejected": -4.133066654205322, + "step": 518 + }, + { + "epoch": 1.200153865880241, + "grad_norm": 18.512425100692376, + "learning_rate": 2.0636732942879917e-07, + "logits/chosen": 0.5643823146820068, + "logits/rejected": 0.5917804837226868, + "logps/chosen": -43.44633483886719, + "logps/rejected": -56.26163101196289, + "loss": 0.166, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.33819130063056946, + "rewards/margins": 3.693488121032715, + "rewards/rejected": -3.3552963733673096, + "step": 520 + }, + { + "epoch": 1.200153865880241, + "eval_logits/chosen": 0.4335879981517792, + "eval_logits/rejected": 0.45994046330451965, + "eval_logps/chosen": -41.402774810791016, + "eval_logps/rejected": -54.35234451293945, + "eval_loss": 0.2449788749217987, + "eval_rewards/accuracies": 0.8317972421646118, + "eval_rewards/chosen": 0.1712525486946106, + "eval_rewards/margins": 3.328258991241455, + "eval_rewards/rejected": -3.1570065021514893, + "eval_runtime": 220.2998, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 520 + }, + { + "epoch": 1.2047698422874729, + "grad_norm": 11.696545134195471, + "learning_rate": 2.0438329226525415e-07, + "logits/chosen": 0.5642399787902832, + "logits/rejected": 0.587860643863678, + "logps/chosen": -41.212337493896484, + "logps/rejected": -43.521636962890625, + "loss": 0.2246, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.5518161058425903, + "rewards/margins": 2.9677634239196777, + "rewards/rejected": -2.415947675704956, + "step": 522 + }, + { + "epoch": 1.2093858186947044, + "grad_norm": 24.196902238001236, + "learning_rate": 2.0240222270883288e-07, + "logits/chosen": 0.5227870941162109, + "logits/rejected": 0.5579611659049988, + "logps/chosen": -44.49864196777344, + "logps/rejected": -64.84123229980469, + "loss": 0.2314, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.16053809225559235, + "rewards/margins": 3.896054267883301, + "rewards/rejected": -3.73551607131958, + "step": 524 + }, + { + "epoch": 1.2140017951019362, + "grad_norm": 12.971615376216704, + "learning_rate": 2.0042424963854542e-07, + "logits/chosen": 0.5063973665237427, + "logits/rejected": 0.5544097423553467, + "logps/chosen": -40.40736389160156, + "logps/rejected": -70.9152603149414, + "loss": 0.1526, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": 0.3248124122619629, + "rewards/margins": 4.234506607055664, + "rewards/rejected": -3.9096946716308594, + "step": 526 + }, + { + "epoch": 1.2186177715091677, + "grad_norm": 14.0866861852398, + "learning_rate": 1.9844950173195883e-07, + "logits/chosen": 0.5182596445083618, + "logits/rejected": 0.549498975276947, + "logps/chosen": -39.39563751220703, + "logps/rejected": -54.05485153198242, + "loss": 0.1818, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.22824376821517944, + "rewards/margins": 3.397740364074707, + "rewards/rejected": -3.169497013092041, + "step": 528 + }, + { + "epoch": 1.2232337479163995, + "grad_norm": 13.76916365285817, + "learning_rate": 1.964781074568265e-07, + "logits/chosen": 0.5031299591064453, + "logits/rejected": 0.5121724009513855, + "logps/chosen": -41.18108367919922, + "logps/rejected": -45.627994537353516, + "loss": 0.1945, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.03019801713526249, + "rewards/margins": 3.0934128761291504, + "rewards/rejected": -3.0632145404815674, + "step": 530 + }, + { + "epoch": 1.2232337479163995, + "eval_logits/chosen": 0.43405523896217346, + "eval_logits/rejected": 0.46039465069770813, + "eval_logps/chosen": -41.60369110107422, + "eval_logps/rejected": -54.51262664794922, + "eval_loss": 0.24258121848106384, + "eval_rewards/accuracies": 0.8335253596305847, + "eval_rewards/chosen": 0.07079467922449112, + "eval_rewards/margins": 3.3079416751861572, + "eval_rewards/rejected": -3.2371468544006348, + "eval_runtime": 220.2641, + "eval_samples_per_second": 7.872, + "eval_steps_per_second": 1.97, + "step": 530 + }, + { + "epoch": 1.2278497243236313, + "grad_norm": 16.411903473780164, + "learning_rate": 1.9451019506273018e-07, + "logits/chosen": 0.541588306427002, + "logits/rejected": 0.5615941286087036, + "logps/chosen": -36.563297271728516, + "logps/rejected": -48.32072448730469, + "loss": 0.2351, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.17822687327861786, + "rewards/margins": 2.845065116882324, + "rewards/rejected": -2.6668384075164795, + "step": 532 + }, + { + "epoch": 1.232465700730863, + "grad_norm": 13.467269631637619, + "learning_rate": 1.9254589257273712e-07, + "logits/chosen": 0.5137292146682739, + "logits/rejected": 0.5505712032318115, + "logps/chosen": -36.598384857177734, + "logps/rejected": -57.48229217529297, + "loss": 0.1473, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.19568167626857758, + "rewards/margins": 4.128161907196045, + "rewards/rejected": -3.9324798583984375, + "step": 534 + }, + { + "epoch": 1.2370816771380946, + "grad_norm": 24.645788661655104, + "learning_rate": 1.9058532777507141e-07, + "logits/chosen": 0.5294635891914368, + "logits/rejected": 0.5472697615623474, + "logps/chosen": -39.22220230102539, + "logps/rejected": -49.91395950317383, + "loss": 0.2172, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.11992057412862778, + "rewards/margins": 3.224815845489502, + "rewards/rejected": -3.1048953533172607, + "step": 536 + }, + { + "epoch": 1.2416976535453264, + "grad_norm": 18.291984511184836, + "learning_rate": 1.886286282148002e-07, + "logits/chosen": 0.5298857688903809, + "logits/rejected": 0.5633623600006104, + "logps/chosen": -41.294647216796875, + "logps/rejected": -57.79304885864258, + "loss": 0.2731, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.11145944148302078, + "rewards/margins": 3.1801443099975586, + "rewards/rejected": -3.2916040420532227, + "step": 538 + }, + { + "epoch": 1.246313629952558, + "grad_norm": 17.71916747448851, + "learning_rate": 1.8667592118553693e-07, + "logits/chosen": 0.5349301099777222, + "logits/rejected": 0.5512058734893799, + "logps/chosen": -43.72676467895508, + "logps/rejected": -52.80296325683594, + "loss": 0.2216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03689540922641754, + "rewards/margins": 3.2271673679351807, + "rewards/rejected": -3.2640628814697266, + "step": 540 + }, + { + "epoch": 1.246313629952558, + "eval_logits/chosen": 0.4325529932975769, + "eval_logits/rejected": 0.45892781019210815, + "eval_logps/chosen": -41.67875289916992, + "eval_logps/rejected": -54.59620666503906, + "eval_loss": 0.24205271899700165, + "eval_rewards/accuracies": 0.8277649879455566, + "eval_rewards/chosen": 0.03326287120580673, + "eval_rewards/margins": 3.312199115753174, + "eval_rewards/rejected": -3.2789359092712402, + "eval_runtime": 220.1774, + "eval_samples_per_second": 7.875, + "eval_steps_per_second": 1.971, + "step": 540 + }, + { + "epoch": 1.2509296063597897, + "grad_norm": 15.1063531754732, + "learning_rate": 1.8472733372115956e-07, + "logits/chosen": 0.4958040416240692, + "logits/rejected": 0.5259097814559937, + "logps/chosen": -43.43186950683594, + "logps/rejected": -60.27039337158203, + "loss": 0.1823, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.40173831582069397, + "rewards/margins": 3.8025894165039062, + "rewards/rejected": -4.2043280601501465, + "step": 542 + }, + { + "epoch": 1.2555455827670214, + "grad_norm": 23.60965925798032, + "learning_rate": 1.8278299258754692e-07, + "logits/chosen": 0.47050708532333374, + "logits/rejected": 0.5154716968536377, + "logps/chosen": -43.42805480957031, + "logps/rejected": -71.56327056884766, + "loss": 0.2284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35217729210853577, + "rewards/margins": 4.311697483062744, + "rewards/rejected": -4.663875102996826, + "step": 544 + }, + { + "epoch": 1.2601615591742532, + "grad_norm": 11.785150141913245, + "learning_rate": 1.808430242743316e-07, + "logits/chosen": 0.46195343136787415, + "logits/rejected": 0.4784909784793854, + "logps/chosen": -42.974945068359375, + "logps/rejected": -54.21615219116211, + "loss": 0.1867, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.31646448373794556, + "rewards/margins": 3.5641021728515625, + "rewards/rejected": -3.2476377487182617, + "step": 546 + }, + { + "epoch": 1.2647775355814848, + "grad_norm": 13.346160813344762, + "learning_rate": 1.7890755498667104e-07, + "logits/chosen": 0.5626040101051331, + "logits/rejected": 0.5980097651481628, + "logps/chosen": -36.59039306640625, + "logps/rejected": -55.57601547241211, + "loss": 0.182, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.17459021508693695, + "rewards/margins": 3.451416015625, + "rewards/rejected": -3.2768259048461914, + "step": 548 + }, + { + "epoch": 1.2693935119887165, + "grad_norm": 25.621843956328824, + "learning_rate": 1.7697671063703756e-07, + "logits/chosen": 0.5085393786430359, + "logits/rejected": 0.5440909266471863, + "logps/chosen": -39.27238464355469, + "logps/rejected": -59.40525817871094, + "loss": 0.2243, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.011964095756411552, + "rewards/margins": 3.6004185676574707, + "rewards/rejected": -3.588454246520996, + "step": 550 + }, + { + "epoch": 1.2693935119887165, + "eval_logits/chosen": 0.4355390965938568, + "eval_logits/rejected": 0.46181005239486694, + "eval_logps/chosen": -41.701602935791016, + "eval_logps/rejected": -54.663360595703125, + "eval_loss": 0.24010230600833893, + "eval_rewards/accuracies": 0.8260368704795837, + "eval_rewards/chosen": 0.0218377523124218, + "eval_rewards/margins": 3.3343515396118164, + "eval_rewards/rejected": -3.312513828277588, + "eval_runtime": 220.234, + "eval_samples_per_second": 7.873, + "eval_steps_per_second": 1.971, + "step": 550 + }, + { + "epoch": 1.274009488395948, + "grad_norm": 29.85339571581757, + "learning_rate": 1.750506168370267e-07, + "logits/chosen": 0.5484946370124817, + "logits/rejected": 0.5642725229263306, + "logps/chosen": -40.738338470458984, + "logps/rejected": -47.2222900390625, + "loss": 0.2665, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.49148619174957275, + "rewards/margins": 3.0378835201263428, + "rewards/rejected": -2.5463972091674805, + "step": 552 + }, + { + "epoch": 1.2786254648031798, + "grad_norm": 11.606234417897845, + "learning_rate": 1.7312939888918594e-07, + "logits/chosen": 0.5540368556976318, + "logits/rejected": 0.5830137729644775, + "logps/chosen": -43.42100143432617, + "logps/rejected": -63.07583999633789, + "loss": 0.1529, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": 0.060752179473638535, + "rewards/margins": 3.951368570327759, + "rewards/rejected": -3.8906164169311523, + "step": 554 + }, + { + "epoch": 1.2832414412104116, + "grad_norm": 8.195981315855988, + "learning_rate": 1.712131817788628e-07, + "logits/chosen": 0.5598903298377991, + "logits/rejected": 0.582931637763977, + "logps/chosen": -39.05931854248047, + "logps/rejected": -49.5858154296875, + "loss": 0.2278, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.035774629563093185, + "rewards/margins": 3.2900662422180176, + "rewards/rejected": -3.325840950012207, + "step": 556 + }, + { + "epoch": 1.2878574176176434, + "grad_norm": 10.58953396876903, + "learning_rate": 1.693020901660738e-07, + "logits/chosen": 0.5586022138595581, + "logits/rejected": 0.5835521221160889, + "logps/chosen": -46.566070556640625, + "logps/rejected": -56.1746940612793, + "loss": 0.1347, + "rewards/accuracies": 0.9583333134651184, + "rewards/chosen": 0.1323520541191101, + "rewards/margins": 3.951080322265625, + "rewards/rejected": -3.81872820854187, + "step": 558 + }, + { + "epoch": 1.292473394024875, + "grad_norm": 20.647672350132265, + "learning_rate": 1.6739624837739518e-07, + "logits/chosen": 0.4893258512020111, + "logits/rejected": 0.5065658092498779, + "logps/chosen": -46.70867919921875, + "logps/rejected": -53.02800369262695, + "loss": 0.2073, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.15436476469039917, + "rewards/margins": 3.050819158554077, + "rewards/rejected": -3.205183744430542, + "step": 560 + }, + { + "epoch": 1.292473394024875, + "eval_logits/chosen": 0.4335208237171173, + "eval_logits/rejected": 0.45989227294921875, + "eval_logps/chosen": -41.82432556152344, + "eval_logps/rejected": -54.859825134277344, + "eval_loss": 0.23924875259399414, + "eval_rewards/accuracies": 0.8312212228775024, + "eval_rewards/chosen": -0.03952277451753616, + "eval_rewards/margins": 3.371224880218506, + "eval_rewards/rejected": -3.410747766494751, + "eval_runtime": 220.3082, + "eval_samples_per_second": 7.871, + "eval_steps_per_second": 1.97, + "step": 560 + }, + { + "epoch": 1.2970893704321067, + "grad_norm": 15.328848187023517, + "learning_rate": 1.6549578039787434e-07, + "logits/chosen": 0.5223647356033325, + "logits/rejected": 0.5576710104942322, + "logps/chosen": -43.448875427246094, + "logps/rejected": -67.14339447021484, + "loss": 0.2405, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.1683083474636078, + "rewards/margins": 3.6626782417297363, + "rewards/rejected": -3.830986499786377, + "step": 562 + }, + { + "epoch": 1.3017053468393383, + "grad_norm": 14.362719389125761, + "learning_rate": 1.6360080986296384e-07, + "logits/chosen": 0.5163556337356567, + "logits/rejected": 0.5569749474525452, + "logps/chosen": -37.78327941894531, + "logps/rejected": -64.23339080810547, + "loss": 0.186, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.047993943095207214, + "rewards/margins": 4.109629154205322, + "rewards/rejected": -4.157623291015625, + "step": 564 + }, + { + "epoch": 1.30632132324657, + "grad_norm": 8.849930925918736, + "learning_rate": 1.6171146005047894e-07, + "logits/chosen": 0.5622715353965759, + "logits/rejected": 0.5891626477241516, + "logps/chosen": -46.50107955932617, + "logps/rejected": -63.37003707885742, + "loss": 0.1689, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.1092919185757637, + "rewards/margins": 4.0769548416137695, + "rewards/rejected": -3.967662811279297, + "step": 566 + }, + { + "epoch": 1.3109372996538018, + "grad_norm": 16.110148125770678, + "learning_rate": 1.5982785387257694e-07, + "logits/chosen": 0.5649956464767456, + "logits/rejected": 0.5782197117805481, + "logps/chosen": -43.4311408996582, + "logps/rejected": -49.03315734863281, + "loss": 0.2002, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.15342091023921967, + "rewards/margins": 2.909942150115967, + "rewards/rejected": -3.0633630752563477, + "step": 568 + }, + { + "epoch": 1.3155532760610336, + "grad_norm": 23.725153045927403, + "learning_rate": 1.5795011386776159e-07, + "logits/chosen": 0.5103439688682556, + "logits/rejected": 0.5300507545471191, + "logps/chosen": -42.80021667480469, + "logps/rejected": -47.7119255065918, + "loss": 0.2255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14677530527114868, + "rewards/margins": 3.0557618141174316, + "rewards/rejected": -3.2025370597839355, + "step": 570 + }, + { + "epoch": 1.3155532760610336, + "eval_logits/chosen": 0.43335986137390137, + "eval_logits/rejected": 0.4598417580127716, + "eval_logps/chosen": -41.851680755615234, + "eval_logps/rejected": -54.97309112548828, + "eval_loss": 0.23906731605529785, + "eval_rewards/accuracies": 0.835829496383667, + "eval_rewards/chosen": -0.05320117622613907, + "eval_rewards/margins": 3.4141783714294434, + "eval_rewards/rejected": -3.467379570007324, + "eval_runtime": 220.3588, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.97, + "step": 570 + }, + { + "epoch": 1.320169252468265, + "grad_norm": 16.172756609459842, + "learning_rate": 1.560783621929113e-07, + "logits/chosen": 0.5175637006759644, + "logits/rejected": 0.5229324102401733, + "logps/chosen": -49.446102142333984, + "logps/rejected": -55.164894104003906, + "loss": 0.1869, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": 0.08015252649784088, + "rewards/margins": 3.3609066009521484, + "rewards/rejected": -3.2807538509368896, + "step": 572 + }, + { + "epoch": 1.3247852288754969, + "grad_norm": 24.72268513177688, + "learning_rate": 1.5421272061533177e-07, + "logits/chosen": 0.5066720247268677, + "logits/rejected": 0.5451788306236267, + "logps/chosen": -37.343570709228516, + "logps/rejected": -60.23046112060547, + "loss": 0.2949, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": 0.1486133188009262, + "rewards/margins": 3.3898818492889404, + "rewards/rejected": -3.2412681579589844, + "step": 574 + }, + { + "epoch": 1.3294012052827284, + "grad_norm": 18.734543272703554, + "learning_rate": 1.5235331050483513e-07, + "logits/chosen": 0.5524860620498657, + "logits/rejected": 0.5772072672843933, + "logps/chosen": -43.33749771118164, + "logps/rejected": -56.5976676940918, + "loss": 0.2367, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.3428743779659271, + "rewards/margins": 3.3112895488739014, + "rewards/rejected": -3.6541638374328613, + "step": 576 + }, + { + "epoch": 1.3340171816899602, + "grad_norm": 15.636365920242639, + "learning_rate": 1.5050025282584327e-07, + "logits/chosen": 0.5805926322937012, + "logits/rejected": 0.6090676188468933, + "logps/chosen": -49.13417434692383, + "logps/rejected": -64.1076431274414, + "loss": 0.1791, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.08167193830013275, + "rewards/margins": 3.955726146697998, + "rewards/rejected": -4.037397861480713, + "step": 578 + }, + { + "epoch": 1.338633158097192, + "grad_norm": 15.524132351808905, + "learning_rate": 1.4865366812951921e-07, + "logits/chosen": 0.598872721195221, + "logits/rejected": 0.62497878074646, + "logps/chosen": -36.58146667480469, + "logps/rejected": -46.25484085083008, + "loss": 0.1893, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.01747778430581093, + "rewards/margins": 3.4903595447540283, + "rewards/rejected": -3.5078377723693848, + "step": 580 + }, + { + "epoch": 1.338633158097192, + "eval_logits/chosen": 0.4342789053916931, + "eval_logits/rejected": 0.46078288555145264, + "eval_logps/chosen": -42.1205940246582, + "eval_logps/rejected": -55.25835418701172, + "eval_loss": 0.2389531433582306, + "eval_rewards/accuracies": 0.8352534770965576, + "eval_rewards/chosen": -0.18765874207019806, + "eval_rewards/margins": 3.4223523139953613, + "eval_rewards/rejected": -3.610011339187622, + "eval_runtime": 220.361, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.969, + "step": 580 + }, + { + "epoch": 1.3432491345044237, + "grad_norm": 22.418640332294185, + "learning_rate": 1.4681367654592446e-07, + "logits/chosen": 0.583182692527771, + "logits/rejected": 0.596510112285614, + "logps/chosen": -45.08745574951172, + "logps/rejected": -52.57502746582031, + "loss": 0.1635, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.24202129244804382, + "rewards/margins": 3.0601682662963867, + "rewards/rejected": -3.302189826965332, + "step": 582 + }, + { + "epoch": 1.3478651109116553, + "grad_norm": 16.477398466397805, + "learning_rate": 1.4498039777620353e-07, + "logits/chosen": 0.5257098078727722, + "logits/rejected": 0.5561378598213196, + "logps/chosen": -49.92831039428711, + "logps/rejected": -66.70814514160156, + "loss": 0.1983, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": 0.07970259338617325, + "rewards/margins": 4.159069538116455, + "rewards/rejected": -4.079366683959961, + "step": 584 + }, + { + "epoch": 1.352481087318887, + "grad_norm": 21.638282072644653, + "learning_rate": 1.4315395108479728e-07, + "logits/chosen": 0.5448426008224487, + "logits/rejected": 0.5733739733695984, + "logps/chosen": -42.567203521728516, + "logps/rejected": -59.23841094970703, + "loss": 0.1872, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.3566977083683014, + "rewards/margins": 3.441741943359375, + "rewards/rejected": -3.7984399795532227, + "step": 586 + }, + { + "epoch": 1.3570970637261186, + "grad_norm": 22.386629994354788, + "learning_rate": 1.4133445529168365e-07, + "logits/chosen": 0.5482079982757568, + "logits/rejected": 0.5674624443054199, + "logps/chosen": -47.31834030151367, + "logps/rejected": -59.47747802734375, + "loss": 0.1735, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -0.25350263714790344, + "rewards/margins": 3.711785316467285, + "rewards/rejected": -3.965287923812866, + "step": 588 + }, + { + "epoch": 1.3617130401333504, + "grad_norm": 14.716672759245373, + "learning_rate": 1.395220287646483e-07, + "logits/chosen": 0.5413531064987183, + "logits/rejected": 0.5619943141937256, + "logps/chosen": -45.74396514892578, + "logps/rejected": -54.50990295410156, + "loss": 0.1609, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.3855374753475189, + "rewards/margins": 3.439289093017578, + "rewards/rejected": -3.82482647895813, + "step": 590 + }, + { + "epoch": 1.3617130401333504, + "eval_logits/chosen": 0.43462061882019043, + "eval_logits/rejected": 0.461076945066452, + "eval_logps/chosen": -42.448509216308594, + "eval_logps/rejected": -55.58904266357422, + "eval_loss": 0.2393806427717209, + "eval_rewards/accuracies": 0.8317972421646118, + "eval_rewards/chosen": -0.3516136407852173, + "eval_rewards/margins": 3.4237425327301025, + "eval_rewards/rejected": -3.7753562927246094, + "eval_runtime": 220.4141, + "eval_samples_per_second": 7.867, + "eval_steps_per_second": 1.969, + "step": 590 + }, + { + "epoch": 1.3663290165405821, + "grad_norm": 21.200823940085225, + "learning_rate": 1.377167894115837e-07, + "logits/chosen": 0.562565803527832, + "logits/rejected": 0.6183031797409058, + "logps/chosen": -38.32450866699219, + "logps/rejected": -68.53689575195312, + "loss": 0.179, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.12118612229824066, + "rewards/margins": 4.04473876953125, + "rewards/rejected": -4.165925025939941, + "step": 592 + }, + { + "epoch": 1.370944992947814, + "grad_norm": 13.082922810935031, + "learning_rate": 1.3591885467281877e-07, + "logits/chosen": 0.4695725440979004, + "logits/rejected": 0.4965362548828125, + "logps/chosen": -39.13195037841797, + "logps/rejected": -58.23176574707031, + "loss": 0.1861, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.30562350153923035, + "rewards/margins": 3.781522035598755, + "rewards/rejected": -4.0871453285217285, + "step": 594 + }, + { + "epoch": 1.3755609693550455, + "grad_norm": 34.97692684836387, + "learning_rate": 1.3412834151347896e-07, + "logits/chosen": 0.5469548106193542, + "logits/rejected": 0.5717971324920654, + "logps/chosen": -44.02994155883789, + "logps/rejected": -57.28227996826172, + "loss": 0.2084, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.3421705365180969, + "rewards/margins": 3.692906379699707, + "rewards/rejected": -4.035076141357422, + "step": 596 + }, + { + "epoch": 1.3801769457622772, + "grad_norm": 14.254996050777464, + "learning_rate": 1.323453664158769e-07, + "logits/chosen": 0.5193799138069153, + "logits/rejected": 0.5635771155357361, + "logps/chosen": -40.06482696533203, + "logps/rejected": -67.0745620727539, + "loss": 0.2322, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.5795915126800537, + "rewards/margins": 3.6668989658355713, + "rewards/rejected": -4.246490001678467, + "step": 598 + }, + { + "epoch": 1.3847929221695088, + "grad_norm": 18.46063830068681, + "learning_rate": 1.3057004537193422e-07, + "logits/chosen": 0.5273723602294922, + "logits/rejected": 0.5402401685714722, + "logps/chosen": -45.491241455078125, + "logps/rejected": -53.827972412109375, + "loss": 0.185, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.32591530680656433, + "rewards/margins": 3.758335590362549, + "rewards/rejected": -4.084251403808594, + "step": 600 + }, + { + "epoch": 1.3847929221695088, + "eval_logits/chosen": 0.4299531877040863, + "eval_logits/rejected": 0.45649805665016174, + "eval_logps/chosen": -42.37248992919922, + "eval_logps/rejected": -55.565975189208984, + "eval_loss": 0.23996217548847198, + "eval_rewards/accuracies": 0.8300691246986389, + "eval_rewards/chosen": -0.31360533833503723, + "eval_rewards/margins": 3.450216054916382, + "eval_rewards/rejected": -3.7638211250305176, + "eval_runtime": 220.4449, + "eval_samples_per_second": 7.866, + "eval_steps_per_second": 1.969, + "step": 600 + }, + { + "epoch": 1.3894088985767405, + "grad_norm": 24.193490725343704, + "learning_rate": 1.2880249387563662e-07, + "logits/chosen": 0.5480252504348755, + "logits/rejected": 0.5805102586746216, + "logps/chosen": -43.4918098449707, + "logps/rejected": -62.1549072265625, + "loss": 0.1713, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.4662397801876068, + "rewards/margins": 3.974961280822754, + "rewards/rejected": -4.441201210021973, + "step": 602 + }, + { + "epoch": 1.3940248749839723, + "grad_norm": 8.975682909766576, + "learning_rate": 1.2704282691551938e-07, + "logits/chosen": 0.45732539892196655, + "logits/rejected": 0.5041163563728333, + "logps/chosen": -40.32965850830078, + "logps/rejected": -67.52854919433594, + "loss": 0.1754, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.014653444290161133, + "rewards/margins": 4.295289993286133, + "rewards/rejected": -4.280636787414551, + "step": 604 + }, + { + "epoch": 1.398640851391204, + "grad_norm": 27.018968489026342, + "learning_rate": 1.2529115896718714e-07, + "logits/chosen": 0.5242836475372314, + "logits/rejected": 0.5399221777915955, + "logps/chosen": -45.72035217285156, + "logps/rejected": -52.612548828125, + "loss": 0.2076, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.37530067563056946, + "rewards/margins": 3.2071659564971924, + "rewards/rejected": -3.5824666023254395, + "step": 606 + }, + { + "epoch": 1.4032568277984356, + "grad_norm": 13.414881670063712, + "learning_rate": 1.2354760398586708e-07, + "logits/chosen": 0.5383539199829102, + "logits/rejected": 0.5773718953132629, + "logps/chosen": -48.75130081176758, + "logps/rejected": -72.36872863769531, + "loss": 0.1511, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.44930893182754517, + "rewards/margins": 4.512818336486816, + "rewards/rejected": -4.962126731872559, + "step": 608 + }, + { + "epoch": 1.4078728042056674, + "grad_norm": 7.330900567316457, + "learning_rate": 1.2181227539899468e-07, + "logits/chosen": 0.5381309986114502, + "logits/rejected": 0.5586973428726196, + "logps/chosen": -45.09908676147461, + "logps/rejected": -58.20050811767578, + "loss": 0.1744, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.2882673442363739, + "rewards/margins": 3.7085728645324707, + "rewards/rejected": -3.996840238571167, + "step": 610 + }, + { + "epoch": 1.4078728042056674, + "eval_logits/chosen": 0.4304519295692444, + "eval_logits/rejected": 0.45695292949676514, + "eval_logps/chosen": -42.44300842285156, + "eval_logps/rejected": -55.6538200378418, + "eval_loss": 0.238841712474823, + "eval_rewards/accuracies": 0.8352534770965576, + "eval_rewards/chosen": -0.34886524081230164, + "eval_rewards/margins": 3.4588773250579834, + "eval_rewards/rejected": -3.8077423572540283, + "eval_runtime": 220.5308, + "eval_samples_per_second": 7.863, + "eval_steps_per_second": 1.968, + "step": 610 + }, + { + "epoch": 1.412488780612899, + "grad_norm": 10.681757170937171, + "learning_rate": 1.2008528609883557e-07, + "logits/chosen": 0.5007774233818054, + "logits/rejected": 0.5296944379806519, + "logps/chosen": -47.22381591796875, + "logps/rejected": -64.06365966796875, + "loss": 0.1531, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.030609939247369766, + "rewards/margins": 4.320724010467529, + "rewards/rejected": -4.351334571838379, + "step": 612 + }, + { + "epoch": 1.4171047570201307, + "grad_norm": 10.655182313924602, + "learning_rate": 1.1836674843514042e-07, + "logits/chosen": 0.5347999930381775, + "logits/rejected": 0.564474880695343, + "logps/chosen": -37.77484893798828, + "logps/rejected": -54.86954879760742, + "loss": 0.175, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.38236376643180847, + "rewards/margins": 3.763371706008911, + "rewards/rejected": -4.145735263824463, + "step": 614 + }, + { + "epoch": 1.4217207334273625, + "grad_norm": 4.808937007878847, + "learning_rate": 1.1665677420783671e-07, + "logits/chosen": 0.5504859089851379, + "logits/rejected": 0.5750877261161804, + "logps/chosen": -43.14183807373047, + "logps/rejected": -53.28805160522461, + "loss": 0.1417, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.12460337579250336, + "rewards/margins": 3.7694272994995117, + "rewards/rejected": -3.894031047821045, + "step": 616 + }, + { + "epoch": 1.4263367098345943, + "grad_norm": 25.84566759360446, + "learning_rate": 1.149554746597553e-07, + "logits/chosen": 0.5723487734794617, + "logits/rejected": 0.6003535389900208, + "logps/chosen": -45.33318328857422, + "logps/rejected": -59.90052795410156, + "loss": 0.262, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.3526383936405182, + "rewards/margins": 3.843003988265991, + "rewards/rejected": -4.195642471313477, + "step": 618 + }, + { + "epoch": 1.4309526862418258, + "grad_norm": 16.545628594299828, + "learning_rate": 1.1326296046939333e-07, + "logits/chosen": 0.5338951945304871, + "logits/rejected": 0.5544497966766357, + "logps/chosen": -39.78907775878906, + "logps/rejected": -49.23013687133789, + "loss": 0.2511, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.12468406558036804, + "rewards/margins": 3.2776834964752197, + "rewards/rejected": -3.402367353439331, + "step": 620 + }, + { + "epoch": 1.4309526862418258, + "eval_logits/chosen": 0.43395209312438965, + "eval_logits/rejected": 0.46030664443969727, + "eval_logps/chosen": -42.41356658935547, + "eval_logps/rejected": -55.699623107910156, + "eval_loss": 0.23819313943386078, + "eval_rewards/accuracies": 0.8317972421646118, + "eval_rewards/chosen": -0.3341463804244995, + "eval_rewards/margins": 3.4965004920959473, + "eval_rewards/rejected": -3.8306467533111572, + "eval_runtime": 220.4261, + "eval_samples_per_second": 7.867, + "eval_steps_per_second": 1.969, + "step": 620 + }, + { + "epoch": 1.4355686626490576, + "grad_norm": 18.451086465748666, + "learning_rate": 1.1157934174371413e-07, + "logits/chosen": 0.497620165348053, + "logits/rejected": 0.5271977782249451, + "logps/chosen": -44.88563919067383, + "logps/rejected": -63.52084732055664, + "loss": 0.1973, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.4545660614967346, + "rewards/margins": 4.014831066131592, + "rewards/rejected": -4.469396591186523, + "step": 622 + }, + { + "epoch": 1.4401846390562894, + "grad_norm": 15.41826391561629, + "learning_rate": 1.0990472801098419e-07, + "logits/chosen": 0.49964290857315063, + "logits/rejected": 0.5341427326202393, + "logps/chosen": -39.38306427001953, + "logps/rejected": -59.41951370239258, + "loss": 0.1465, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.07668253034353256, + "rewards/margins": 4.010004043579102, + "rewards/rejected": -4.086687088012695, + "step": 624 + }, + { + "epoch": 1.444800615463521, + "grad_norm": 13.657128245878823, + "learning_rate": 1.0823922821364795e-07, + "logits/chosen": 0.5488825440406799, + "logits/rejected": 0.5648425221443176, + "logps/chosen": -49.72515869140625, + "logps/rejected": -57.29216766357422, + "loss": 0.1844, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.15428660809993744, + "rewards/margins": 3.7048492431640625, + "rewards/rejected": -3.859135627746582, + "step": 626 + }, + { + "epoch": 1.4494165918707527, + "grad_norm": 17.171702939592354, + "learning_rate": 1.0658295070124026e-07, + "logits/chosen": 0.5274313688278198, + "logits/rejected": 0.540188729763031, + "logps/chosen": -47.955406188964844, + "logps/rejected": -54.03617477416992, + "loss": 0.2187, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.16990727186203003, + "rewards/margins": 3.60162091255188, + "rewards/rejected": -3.7715280055999756, + "step": 628 + }, + { + "epoch": 1.4540325682779844, + "grad_norm": 25.795693399142227, + "learning_rate": 1.0493600322333762e-07, + "logits/chosen": 0.5215524435043335, + "logits/rejected": 0.5590708255767822, + "logps/chosen": -44.3021354675293, + "logps/rejected": -73.55774688720703, + "loss": 0.141, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.3967975080013275, + "rewards/margins": 4.7301740646362305, + "rewards/rejected": -5.12697172164917, + "step": 630 + }, + { + "epoch": 1.4540325682779844, + "eval_logits/chosen": 0.43174034357070923, + "eval_logits/rejected": 0.4582732319831848, + "eval_logps/chosen": -42.194610595703125, + "eval_logps/rejected": -55.55934524536133, + "eval_loss": 0.23693177103996277, + "eval_rewards/accuracies": 0.8317972421646118, + "eval_rewards/chosen": -0.22466643154621124, + "eval_rewards/margins": 3.535839080810547, + "eval_rewards/rejected": -3.7605059146881104, + "eval_runtime": 220.3801, + "eval_samples_per_second": 7.868, + "eval_steps_per_second": 1.969, + "step": 630 + }, + { + "epoch": 1.458648544685216, + "grad_norm": 14.475820972948407, + "learning_rate": 1.0329849292254883e-07, + "logits/chosen": 0.596792995929718, + "logits/rejected": 0.624647855758667, + "logps/chosen": -45.63186264038086, + "logps/rejected": -62.25794982910156, + "loss": 0.1936, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.2872418463230133, + "rewards/margins": 3.9080302715301514, + "rewards/rejected": -4.195271968841553, + "step": 632 + }, + { + "epoch": 1.4632645210924478, + "grad_norm": 26.862980766739724, + "learning_rate": 1.0167052632754458e-07, + "logits/chosen": 0.5725838541984558, + "logits/rejected": 0.5932745337486267, + "logps/chosen": -41.20800018310547, + "logps/rejected": -51.21732711791992, + "loss": 0.227, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.39695149660110474, + "rewards/margins": 2.928715229034424, + "rewards/rejected": -3.325666666030884, + "step": 634 + }, + { + "epoch": 1.4678804974996795, + "grad_norm": 13.962052681918495, + "learning_rate": 1.0005220934612713e-07, + "logits/chosen": 0.6229636669158936, + "logits/rejected": 0.6402004361152649, + "logps/chosen": -46.95052719116211, + "logps/rejected": -53.86199951171875, + "loss": 0.1824, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.3401051461696625, + "rewards/margins": 3.6175549030303955, + "rewards/rejected": -3.95766019821167, + "step": 636 + }, + { + "epoch": 1.472496473906911, + "grad_norm": 9.092245687630806, + "learning_rate": 9.844364725834056e-08, + "logits/chosen": 0.48213544487953186, + "logits/rejected": 0.5316063761711121, + "logps/chosen": -45.23646545410156, + "logps/rejected": -75.49991607666016, + "loss": 0.0997, + "rewards/accuracies": 0.9583333134651184, + "rewards/chosen": -0.1606331765651703, + "rewards/margins": 5.202739238739014, + "rewards/rejected": -5.363372802734375, + "step": 638 + }, + { + "epoch": 1.4771124503141428, + "grad_norm": 18.96340702396886, + "learning_rate": 9.68449447096217e-08, + "logits/chosen": 0.4373500943183899, + "logits/rejected": 0.4579113721847534, + "logps/chosen": -39.44499588012695, + "logps/rejected": -51.54633712768555, + "loss": 0.3299, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -0.2867163419723511, + "rewards/margins": 3.076793670654297, + "rewards/rejected": -3.3635098934173584, + "step": 640 + }, + { + "epoch": 1.4771124503141428, + "eval_logits/chosen": 0.4346330463886261, + "eval_logits/rejected": 0.461146742105484, + "eval_logps/chosen": -42.071449279785156, + "eval_logps/rejected": -55.46683883666992, + "eval_loss": 0.23784740269184113, + "eval_rewards/accuracies": 0.835829496383667, + "eval_rewards/chosen": -0.16308562457561493, + "eval_rewards/margins": 3.551164388656616, + "eval_rewards/rejected": -3.714250087738037, + "eval_runtime": 220.3881, + "eval_samples_per_second": 7.868, + "eval_steps_per_second": 1.969, + "step": 640 + }, + { + "epoch": 1.4817284267213746, + "grad_norm": 22.570461867090884, + "learning_rate": 9.525620570399259e-08, + "logits/chosen": 0.5038811564445496, + "logits/rejected": 0.5432533025741577, + "logps/chosen": -44.41080856323242, + "logps/rejected": -65.23593139648438, + "loss": 0.1275, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -0.2485545426607132, + "rewards/margins": 4.013004779815674, + "rewards/rejected": -4.261559009552002, + "step": 642 + }, + { + "epoch": 1.4863444031286062, + "grad_norm": 11.127499049370783, + "learning_rate": 9.36775335972943e-08, + "logits/chosen": 0.4518318772315979, + "logits/rejected": 0.531367838382721, + "logps/chosen": -39.415767669677734, + "logps/rejected": -98.71846771240234, + "loss": 0.1566, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.000497970322612673, + "rewards/margins": 6.575231075286865, + "rewards/rejected": -6.575727939605713, + "step": 644 + }, + { + "epoch": 1.490960379535838, + "grad_norm": 24.53509661266678, + "learning_rate": 9.210903109046284e-08, + "logits/chosen": 0.46663856506347656, + "logits/rejected": 0.5147727727890015, + "logps/chosen": -43.30581283569336, + "logps/rejected": -63.16206741333008, + "loss": 0.1684, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.5338683128356934, + "rewards/margins": 4.3571882247924805, + "rewards/rejected": -4.89105749130249, + "step": 646 + }, + { + "epoch": 1.4955763559430697, + "grad_norm": 11.303027411423997, + "learning_rate": 9.05508002228485e-08, + "logits/chosen": 0.529050350189209, + "logits/rejected": 0.5628350377082825, + "logps/chosen": -38.363826751708984, + "logps/rejected": -53.06625747680664, + "loss": 0.2071, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.031818799674510956, + "rewards/margins": 3.961611032485962, + "rewards/rejected": -3.929792642593384, + "step": 648 + }, + { + "epoch": 1.5001923323503012, + "grad_norm": 10.500286558923209, + "learning_rate": 8.900294236557707e-08, + "logits/chosen": 0.49337685108184814, + "logits/rejected": 0.5243138074874878, + "logps/chosen": -37.17765808105469, + "logps/rejected": -49.10523986816406, + "loss": 0.2143, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": 0.008912450633943081, + "rewards/margins": 3.240175485610962, + "rewards/rejected": -3.2312631607055664, + "step": 650 + }, + { + "epoch": 1.5001923323503012, + "eval_logits/chosen": 0.4313080310821533, + "eval_logits/rejected": 0.45790737867355347, + "eval_logps/chosen": -42.17680740356445, + "eval_logps/rejected": -55.66178894042969, + "eval_loss": 0.2398524433374405, + "eval_rewards/accuracies": 0.8306451439857483, + "eval_rewards/chosen": -0.21576282382011414, + "eval_rewards/margins": 3.59596586227417, + "eval_rewards/rejected": -3.8117284774780273, + "eval_runtime": 220.4293, + "eval_samples_per_second": 7.866, + "eval_steps_per_second": 1.969, + "step": 650 + }, + { + "epoch": 1.504808308757533, + "grad_norm": 21.390880404408534, + "learning_rate": 8.746555821495561e-08, + "logits/chosen": 0.4801899492740631, + "logits/rejected": 0.5136987566947937, + "logps/chosen": -43.907596588134766, + "logps/rejected": -62.06863021850586, + "loss": 0.1972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21802374720573425, + "rewards/margins": 4.019637584686279, + "rewards/rejected": -4.237661361694336, + "step": 652 + }, + { + "epoch": 1.5094242851647648, + "grad_norm": 17.814740010117944, + "learning_rate": 8.593874778592122e-08, + "logits/chosen": 0.4772498309612274, + "logits/rejected": 0.5082363486289978, + "logps/chosen": -36.85258483886719, + "logps/rejected": -49.34876251220703, + "loss": 0.1537, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.038329627364873886, + "rewards/margins": 3.5393142700195312, + "rewards/rejected": -3.577643394470215, + "step": 654 + }, + { + "epoch": 1.5140402615719966, + "grad_norm": 24.684686325904988, + "learning_rate": 8.442261040553472e-08, + "logits/chosen": 0.5512763857841492, + "logits/rejected": 0.5618037581443787, + "logps/chosen": -44.694515228271484, + "logps/rejected": -49.48525619506836, + "loss": 0.1683, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.0919620469212532, + "rewards/margins": 3.498401403427124, + "rewards/rejected": -3.406439781188965, + "step": 656 + }, + { + "epoch": 1.518656237979228, + "grad_norm": 21.50701378180569, + "learning_rate": 8.291724470651903e-08, + "logits/chosen": 0.49069249629974365, + "logits/rejected": 0.5210825800895691, + "logps/chosen": -44.639766693115234, + "logps/rejected": -57.28916549682617, + "loss": 0.2335, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.4066303074359894, + "rewards/margins": 3.4069387912750244, + "rewards/rejected": -3.813568592071533, + "step": 658 + }, + { + "epoch": 1.5232722143864597, + "grad_norm": 11.082339838552715, + "learning_rate": 8.14227486208423e-08, + "logits/chosen": 0.4665941596031189, + "logits/rejected": 0.4930134415626526, + "logps/chosen": -37.94073486328125, + "logps/rejected": -53.0433464050293, + "loss": 0.1797, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": 0.09727773815393448, + "rewards/margins": 3.91404128074646, + "rewards/rejected": -3.8167638778686523, + "step": 660 + }, + { + "epoch": 1.5232722143864597, + "eval_logits/chosen": 0.43500614166259766, + "eval_logits/rejected": 0.4616233706474304, + "eval_logps/chosen": -42.075767517089844, + "eval_logps/rejected": -55.58706283569336, + "eval_loss": 0.2391819953918457, + "eval_rewards/accuracies": 0.8306451439857483, + "eval_rewards/chosen": -0.1652439683675766, + "eval_rewards/margins": 3.609118938446045, + "eval_rewards/rejected": -3.774362802505493, + "eval_runtime": 220.4966, + "eval_samples_per_second": 7.864, + "eval_steps_per_second": 1.968, + "step": 660 + }, + { + "epoch": 1.5278881907936914, + "grad_norm": 17.884909353386927, + "learning_rate": 7.993921937334716e-08, + "logits/chosen": 0.5584304332733154, + "logits/rejected": 0.5700749754905701, + "logps/chosen": -41.323944091796875, + "logps/rejected": -49.892147064208984, + "loss": 0.2096, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.27705806493759155, + "rewards/margins": 3.536667823791504, + "rewards/rejected": -3.813725709915161, + "step": 662 + }, + { + "epoch": 1.5325041672009232, + "grad_norm": 6.982953174746173, + "learning_rate": 7.846675347542578e-08, + "logits/chosen": 0.5807335376739502, + "logits/rejected": 0.6132792234420776, + "logps/chosen": -37.81986999511719, + "logps/rejected": -49.71797180175781, + "loss": 0.1272, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": 0.3861154019832611, + "rewards/margins": 4.170031547546387, + "rewards/rejected": -3.783916473388672, + "step": 664 + }, + { + "epoch": 1.537120143608155, + "grad_norm": 18.18022469520284, + "learning_rate": 7.700544671874079e-08, + "logits/chosen": 0.6006969213485718, + "logits/rejected": 0.6162829995155334, + "logps/chosen": -47.33814239501953, + "logps/rejected": -52.70623016357422, + "loss": 0.1962, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.2818297743797302, + "rewards/margins": 3.495248317718506, + "rewards/rejected": -3.7770779132843018, + "step": 666 + }, + { + "epoch": 1.5417361200153867, + "grad_norm": 17.752568042598934, + "learning_rate": 7.555539416899437e-08, + "logits/chosen": 0.5043608546257019, + "logits/rejected": 0.535383939743042, + "logps/chosen": -37.40916442871094, + "logps/rejected": -52.42148971557617, + "loss": 0.2323, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.4006814658641815, + "rewards/margins": 3.385708808898926, + "rewards/rejected": -3.7863900661468506, + "step": 668 + }, + { + "epoch": 1.5463520964226183, + "grad_norm": 14.165329854797266, + "learning_rate": 7.41166901597429e-08, + "logits/chosen": 0.5081818699836731, + "logits/rejected": 0.5341579914093018, + "logps/chosen": -42.154205322265625, + "logps/rejected": -55.97992706298828, + "loss": 0.1774, + "rewards/accuracies": 0.9027777910232544, + "rewards/chosen": -0.05981425940990448, + "rewards/margins": 3.988154172897339, + "rewards/rejected": -4.047967910766602, + "step": 670 + }, + { + "epoch": 1.5463520964226183, + "eval_logits/chosen": 0.4372006952762604, + "eval_logits/rejected": 0.46362602710723877, + "eval_logps/chosen": -42.13774490356445, + "eval_logps/rejected": -55.63636779785156, + "eval_loss": 0.23786574602127075, + "eval_rewards/accuracies": 0.8329492807388306, + "eval_rewards/chosen": -0.19623348116874695, + "eval_rewards/margins": 3.602783203125, + "eval_rewards/rejected": -3.7990164756774902, + "eval_runtime": 220.5205, + "eval_samples_per_second": 7.863, + "eval_steps_per_second": 1.968, + "step": 670 + }, + { + "epoch": 1.5509680728298498, + "grad_norm": 22.84931762442886, + "learning_rate": 7.268942828626046e-08, + "logits/chosen": 0.5015777349472046, + "logits/rejected": 0.5260412096977234, + "logps/chosen": -39.39936828613281, + "logps/rejected": -50.80826950073242, + "loss": 0.2259, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.02117648348212242, + "rewards/margins": 3.6156790256500244, + "rewards/rejected": -3.6368556022644043, + "step": 672 + }, + { + "epoch": 1.5555840492370816, + "grad_norm": 10.729660784502734, + "learning_rate": 7.127370139945018e-08, + "logits/chosen": 0.5064399242401123, + "logits/rejected": 0.542765736579895, + "logps/chosen": -41.118350982666016, + "logps/rejected": -57.55162048339844, + "loss": 0.1581, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.18698811531066895, + "rewards/margins": 4.028824806213379, + "rewards/rejected": -4.215813159942627, + "step": 674 + }, + { + "epoch": 1.5602000256443134, + "grad_norm": 12.758336439580667, + "learning_rate": 6.986960159980326e-08, + "logits/chosen": 0.5471921563148499, + "logits/rejected": 0.5656020045280457, + "logps/chosen": -44.28984069824219, + "logps/rejected": -53.67868423461914, + "loss": 0.1621, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.007483018562197685, + "rewards/margins": 3.514232873916626, + "rewards/rejected": -3.5217158794403076, + "step": 676 + }, + { + "epoch": 1.5648160020515451, + "grad_norm": 25.743372698631337, + "learning_rate": 6.847722023140776e-08, + "logits/chosen": 0.5099420547485352, + "logits/rejected": 0.5306479930877686, + "logps/chosen": -38.24551773071289, + "logps/rejected": -46.37004470825195, + "loss": 0.2453, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.13890628516674042, + "rewards/margins": 3.242166757583618, + "rewards/rejected": -3.381072998046875, + "step": 678 + }, + { + "epoch": 1.569431978458777, + "grad_norm": 29.001544411683714, + "learning_rate": 6.709664787600616e-08, + "logits/chosen": 0.5341071486473083, + "logits/rejected": 0.549387514591217, + "logps/chosen": -38.39107131958008, + "logps/rejected": -45.22284698486328, + "loss": 0.2519, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": -0.32282909750938416, + "rewards/margins": 2.876624822616577, + "rewards/rejected": -3.1994540691375732, + "step": 680 + }, + { + "epoch": 1.569431978458777, + "eval_logits/chosen": 0.4367799460887909, + "eval_logits/rejected": 0.46335569024086, + "eval_logps/chosen": -42.14803695678711, + "eval_logps/rejected": -55.68684005737305, + "eval_loss": 0.23701736330986023, + "eval_rewards/accuracies": 0.8335253596305847, + "eval_rewards/chosen": -0.20137952268123627, + "eval_rewards/margins": 3.622871160507202, + "eval_rewards/rejected": -3.8242506980895996, + "eval_runtime": 220.405, + "eval_samples_per_second": 7.867, + "eval_steps_per_second": 1.969, + "step": 680 + }, + { + "epoch": 1.5740479548660085, + "grad_norm": 26.57226192590101, + "learning_rate": 6.572797434710219e-08, + "logits/chosen": 0.47764989733695984, + "logits/rejected": 0.5231152772903442, + "logps/chosen": -39.2479362487793, + "logps/rejected": -67.22251892089844, + "loss": 0.1985, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.03589929640293121, + "rewards/margins": 4.406409740447998, + "rewards/rejected": -4.370510578155518, + "step": 682 + }, + { + "epoch": 1.57866393127324, + "grad_norm": 7.8158043752344115, + "learning_rate": 6.437128868411856e-08, + "logits/chosen": 0.5327097177505493, + "logits/rejected": 0.5473262071609497, + "logps/chosen": -38.83921813964844, + "logps/rejected": -47.30848693847656, + "loss": 0.212, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": 0.002615167060866952, + "rewards/margins": 3.3942179679870605, + "rewards/rejected": -3.3916027545928955, + "step": 684 + }, + { + "epoch": 1.5832799076804718, + "grad_norm": 11.10012939486401, + "learning_rate": 6.302667914660384e-08, + "logits/chosen": 0.5219799280166626, + "logits/rejected": 0.55839604139328, + "logps/chosen": -37.46578598022461, + "logps/rejected": -54.46531295776367, + "loss": 0.2233, + "rewards/accuracies": 0.8472222089767456, + "rewards/chosen": 0.02391706593334675, + "rewards/margins": 3.7034101486206055, + "rewards/rejected": -3.679492950439453, + "step": 686 + }, + { + "epoch": 1.5878958840877035, + "grad_norm": 19.67549763311113, + "learning_rate": 6.169423320849112e-08, + "logits/chosen": 0.5211795568466187, + "logits/rejected": 0.5298517346382141, + "logps/chosen": -45.8150520324707, + "logps/rejected": -47.33256149291992, + "loss": 0.2021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2716074287891388, + "rewards/margins": 3.559727191925049, + "rewards/rejected": -3.831334352493286, + "step": 688 + }, + { + "epoch": 1.5925118604949353, + "grad_norm": 15.711220514951888, + "learning_rate": 6.037403755240748e-08, + "logits/chosen": 0.5544189810752869, + "logits/rejected": 0.5787670612335205, + "logps/chosen": -45.216304779052734, + "logps/rejected": -59.76258850097656, + "loss": 0.1572, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -0.14509858191013336, + "rewards/margins": 3.88366436958313, + "rewards/rejected": -4.0287628173828125, + "step": 690 + }, + { + "epoch": 1.5925118604949353, + "eval_logits/chosen": 0.43276646733283997, + "eval_logits/rejected": 0.45934849977493286, + "eval_logps/chosen": -42.20445251464844, + "eval_logps/rejected": -55.753753662109375, + "eval_loss": 0.23724210262298584, + "eval_rewards/accuracies": 0.8317972421646118, + "eval_rewards/chosen": -0.2295861542224884, + "eval_rewards/margins": 3.6281206607818604, + "eval_rewards/rejected": -3.8577067852020264, + "eval_runtime": 220.4833, + "eval_samples_per_second": 7.865, + "eval_steps_per_second": 1.968, + "step": 690 + }, + { + "epoch": 1.597127836902167, + "grad_norm": 14.487508826565733, + "learning_rate": 5.9066178064034326e-08, + "logits/chosen": 0.4430210590362549, + "logits/rejected": 0.4965353012084961, + "logps/chosen": -33.27760696411133, + "logps/rejected": -71.74127197265625, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2861379384994507, + "rewards/margins": 4.55012321472168, + "rewards/rejected": -4.836262226104736, + "step": 692 + }, + { + "epoch": 1.6017438133093986, + "grad_norm": 23.580990452467088, + "learning_rate": 5.777073982652064e-08, + "logits/chosen": 0.5170236825942993, + "logits/rejected": 0.5521243214607239, + "logps/chosen": -35.71030044555664, + "logps/rejected": -52.74575424194336, + "loss": 0.2247, + "rewards/accuracies": 0.8611111044883728, + "rewards/chosen": -0.3935600519180298, + "rewards/margins": 3.574741840362549, + "rewards/rejected": -3.96830153465271, + "step": 694 + }, + { + "epoch": 1.6063597897166302, + "grad_norm": 13.54068941517088, + "learning_rate": 5.6487807114947325e-08, + "logits/chosen": 0.551853358745575, + "logits/rejected": 0.5928479433059692, + "logps/chosen": -42.63957214355469, + "logps/rejected": -70.68295288085938, + "loss": 0.1803, + "rewards/accuracies": 0.8888888955116272, + "rewards/chosen": -0.2251981645822525, + "rewards/margins": 4.277625560760498, + "rewards/rejected": -4.502823352813721, + "step": 696 + }, + { + "epoch": 1.610975766123862, + "grad_norm": 27.742044897151906, + "learning_rate": 5.521746339084532e-08, + "logits/chosen": 0.5765677094459534, + "logits/rejected": 0.5921374559402466, + "logps/chosen": -47.175655364990234, + "logps/rejected": -58.09642028808594, + "loss": 0.2516, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -0.3188338875770569, + "rewards/margins": 3.57645583152771, + "rewards/rejected": -3.895289897918701, + "step": 698 + }, + { + "epoch": 1.6155917425310937, + "grad_norm": 13.652878320465026, + "learning_rate": 5.39597912967652e-08, + "logits/chosen": 0.5359885692596436, + "logits/rejected": 0.575743556022644, + "logps/chosen": -38.843807220458984, + "logps/rejected": -61.49338150024414, + "loss": 0.1886, + "rewards/accuracies": 0.9166666865348816, + "rewards/chosen": -0.01514108944684267, + "rewards/margins": 4.108646392822266, + "rewards/rejected": -4.1237874031066895, + "step": 700 + }, + { + "epoch": 1.6155917425310937, + "eval_logits/chosen": 0.43191081285476685, + "eval_logits/rejected": 0.4585791528224945, + "eval_logps/chosen": -42.20844268798828, + "eval_logps/rejected": -55.773094177246094, + "eval_loss": 0.23592650890350342, + "eval_rewards/accuracies": 0.8364055156707764, + "eval_rewards/chosen": -0.23158276081085205, + "eval_rewards/margins": 3.635798692703247, + "eval_rewards/rejected": -3.8673815727233887, + "eval_runtime": 220.5019, + "eval_samples_per_second": 7.864, + "eval_steps_per_second": 1.968, + "step": 700 + } + ], + "logging_steps": 2, + "max_steps": 866, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}