diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,20 +1,20 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9992254066615027, + "epoch": 3.0, "eval_steps": 100, - "global_step": 2904, + "global_step": 5811, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 1.7182130584192438e-09, - "logits/chosen": -2.8386430740356445, - "logits/rejected": -2.8774726390838623, - "logps/chosen": -396.7501220703125, - "logps/rejected": -306.6087951660156, + "learning_rate": 8.591065292096219e-10, + "logits/chosen": -2.7645795345306396, + "logits/rejected": -2.8125061988830566, + "logps/chosen": -113.67314910888672, + "logps/rejected": -132.0498504638672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,4540 +24,9078 @@ }, { "epoch": 0.01, - "learning_rate": 1.718213058419244e-08, - "logits/chosen": -2.9410815238952637, - "logits/rejected": -2.9279916286468506, - "logps/chosen": -364.9696350097656, - "logps/rejected": -268.6126403808594, - "loss": 0.6943, - "rewards/accuracies": 0.4722222089767456, - "rewards/chosen": 0.005337627604603767, - "rewards/margins": 0.005309337750077248, - "rewards/rejected": 2.8289776309975423e-05, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -2.960064649581909, + "logits/rejected": -2.973264694213867, + "logps/chosen": -281.9268798828125, + "logps/rejected": -290.8782958984375, + "loss": 0.693, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -5.9445181250339374e-05, + "rewards/margins": -0.0016730944626033306, + "rewards/rejected": 0.0016136488411575556, "step": 10 }, { - "epoch": 0.02, - "learning_rate": 3.436426116838488e-08, - "logits/chosen": -2.930140972137451, - "logits/rejected": -2.9554715156555176, - "logps/chosen": -357.8751525878906, - "logps/rejected": -295.104736328125, - "loss": 0.6828, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.05925828218460083, - "rewards/margins": 0.02539578638970852, - "rewards/rejected": 0.03386249393224716, + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.9243266582489014, + "logits/rejected": -2.9458937644958496, + "logps/chosen": -219.5768280029297, + "logps/rejected": -229.09963989257812, + "loss": 0.6934, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009228921495378017, + "rewards/margins": 0.006980190984904766, + "rewards/rejected": 0.0022487309761345387, "step": 20 }, { - "epoch": 0.03, - "learning_rate": 5.154639175257731e-08, - "logits/chosen": -2.9392919540405273, - "logits/rejected": -2.9263033866882324, - "logps/chosen": -336.83807373046875, - "logps/rejected": -283.37139892578125, - "loss": 0.6563, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.17703184485435486, - "rewards/margins": 0.07941180467605591, - "rewards/rejected": 0.09762003272771835, + "epoch": 0.02, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -2.918198347091675, + "logits/rejected": -2.9479143619537354, + "logps/chosen": -284.60943603515625, + "logps/rejected": -312.8995056152344, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0206824392080307, + "rewards/margins": 0.003921012859791517, + "rewards/rejected": 0.016761427745223045, "step": 30 }, { - "epoch": 0.04, - "learning_rate": 6.872852233676976e-08, - "logits/chosen": -2.9387967586517334, - "logits/rejected": -2.9206314086914062, - "logps/chosen": -384.8312683105469, - "logps/rejected": -306.2314453125, - "loss": 0.6442, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.3545844554901123, - "rewards/margins": 0.17298033833503723, - "rewards/rejected": 0.18160411715507507, + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.8567323684692383, + "logits/rejected": -2.896390438079834, + "logps/chosen": -322.072021484375, + "logps/rejected": -279.9306945800781, + "loss": 0.6811, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04524571821093559, + "rewards/margins": 0.03146430477499962, + "rewards/rejected": 0.013781411573290825, "step": 40 }, { - "epoch": 0.05, - "learning_rate": 8.59106529209622e-08, - "logits/chosen": -2.9341976642608643, - "logits/rejected": -2.901994466781616, - "logps/chosen": -324.2142639160156, - "logps/rejected": -247.2388458251953, - "loss": 0.6214, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.37454044818878174, - "rewards/margins": 0.2947521507740021, - "rewards/rejected": 0.07978831231594086, + "epoch": 0.03, + "learning_rate": 4.29553264604811e-08, + "logits/chosen": -2.9978153705596924, + "logits/rejected": -3.0320467948913574, + "logps/chosen": -201.69992065429688, + "logps/rejected": -236.61203002929688, + "loss": 0.6587, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.08155672252178192, + "rewards/margins": 0.07839250564575195, + "rewards/rejected": 0.0031642187386751175, "step": 50 }, { - "epoch": 0.06, - "learning_rate": 1.0309278350515462e-07, - "logits/chosen": -2.929769992828369, - "logits/rejected": -2.9284074306488037, - "logps/chosen": -358.61199951171875, - "logps/rejected": -272.9038391113281, - "loss": 0.6117, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.5510571599006653, - "rewards/margins": 0.40916723012924194, - "rewards/rejected": 0.14188989996910095, + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.8887217044830322, + "logits/rejected": -2.8648734092712402, + "logps/chosen": -250.0323028564453, + "logps/rejected": -308.8505859375, + "loss": 0.6368, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14510753750801086, + "rewards/margins": 0.12705275416374207, + "rewards/rejected": 0.01805477775633335, "step": 60 }, { - "epoch": 0.07, - "learning_rate": 1.202749140893471e-07, - "logits/chosen": -2.9810051918029785, - "logits/rejected": -2.98490571975708, - "logps/chosen": -334.68927001953125, - "logps/rejected": -265.19873046875, - "loss": 0.6119, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.46736612915992737, - "rewards/margins": 0.40175753831863403, - "rewards/rejected": 0.06560859829187393, + "epoch": 0.04, + "learning_rate": 6.013745704467354e-08, + "logits/chosen": -2.95893931388855, + "logits/rejected": -2.941819190979004, + "logps/chosen": -304.4744567871094, + "logps/rejected": -301.7149353027344, + "loss": 0.6367, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23246800899505615, + "rewards/margins": 0.17572557926177979, + "rewards/rejected": 0.056742388755083084, "step": 70 }, { - "epoch": 0.08, - "learning_rate": 1.3745704467353952e-07, - "logits/chosen": -2.9579415321350098, - "logits/rejected": -2.9277639389038086, - "logps/chosen": -346.24163818359375, - "logps/rejected": -263.8692321777344, - "loss": 0.558, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4413931965827942, - "rewards/margins": 0.5552295446395874, - "rewards/rejected": -0.1138363927602768, + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.955170154571533, + "logits/rejected": -2.9491288661956787, + "logps/chosen": -286.9136047363281, + "logps/rejected": -303.0902099609375, + "loss": 0.5875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.30379384756088257, + "rewards/margins": 0.26180416345596313, + "rewards/rejected": 0.04198961704969406, "step": 80 }, { - "epoch": 0.09, - "learning_rate": 1.5463917525773197e-07, - "logits/chosen": -2.9735989570617676, - "logits/rejected": -2.9386916160583496, - "logps/chosen": -307.0599670410156, - "logps/rejected": -263.30126953125, - "loss": 0.5294, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.45517802238464355, - "rewards/margins": 0.5852676630020142, - "rewards/rejected": -0.1300896406173706, + "epoch": 0.05, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -2.949690341949463, + "logits/rejected": -3.017688274383545, + "logps/chosen": -249.6548309326172, + "logps/rejected": -293.2023620605469, + "loss": 0.569, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.33933955430984497, + "rewards/margins": 0.3257867693901062, + "rewards/rejected": 0.013552774675190449, "step": 90 }, { - "epoch": 0.1, - "learning_rate": 1.718213058419244e-07, - "logits/chosen": -2.8946220874786377, - "logits/rejected": -2.891949415206909, - "logps/chosen": -353.20416259765625, - "logps/rejected": -249.47891235351562, - "loss": 0.5513, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": 0.47438564896583557, - "rewards/margins": 0.6653046607971191, - "rewards/rejected": -0.1909189671278, + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.8686530590057373, + "logits/rejected": -2.9002976417541504, + "logps/chosen": -276.3162841796875, + "logps/rejected": -236.25296020507812, + "loss": 0.5613, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4270657002925873, + "rewards/margins": 0.36059075593948364, + "rewards/rejected": 0.06647494435310364, "step": 100 }, { - "epoch": 0.1, - "eval_logits/chosen": -2.9198687076568604, - "eval_logits/rejected": -2.904863119125366, - "eval_logps/chosen": -348.2250061035156, - "eval_logps/rejected": -286.0747375488281, - "eval_loss": 0.5430884957313538, - "eval_rewards/accuracies": 0.765999972820282, - "eval_rewards/chosen": 0.4640864431858063, - "eval_rewards/margins": 0.6596349477767944, - "eval_rewards/rejected": -0.19554853439331055, - "eval_runtime": 499.9395, - "eval_samples_per_second": 4.0, - "eval_steps_per_second": 0.5, + "epoch": 0.05, + "eval_logits/chosen": -2.904775619506836, + "eval_logits/rejected": -2.9494807720184326, + "eval_logps/chosen": -243.06394958496094, + "eval_logps/rejected": -275.9722595214844, + "eval_loss": 0.5542330145835876, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": 0.46162015199661255, + "eval_rewards/margins": 0.4451034665107727, + "eval_rewards/rejected": 0.01651667058467865, + "eval_runtime": 278.9085, + "eval_samples_per_second": 7.171, + "eval_steps_per_second": 0.448, "step": 100 }, { - "epoch": 0.11, - "learning_rate": 1.8900343642611682e-07, - "logits/chosen": -2.873088836669922, - "logits/rejected": -2.864135265350342, - "logps/chosen": -339.3306884765625, - "logps/rejected": -271.645751953125, - "loss": 0.5789, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.2165219783782959, - "rewards/margins": 0.534845769405365, - "rewards/rejected": -0.31832385063171387, + "epoch": 0.06, + "learning_rate": 9.450171821305841e-08, + "logits/chosen": -2.873464584350586, + "logits/rejected": -2.9402079582214355, + "logps/chosen": -246.4360809326172, + "logps/rejected": -280.4523620605469, + "loss": 0.5523, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4688049256801605, + "rewards/margins": 0.5279902219772339, + "rewards/rejected": -0.0591852143406868, "step": 110 }, { - "epoch": 0.12, - "learning_rate": 2.0618556701030925e-07, - "logits/chosen": -2.948822259902954, - "logits/rejected": -2.9265244007110596, - "logps/chosen": -339.6183166503906, - "logps/rejected": -278.4152526855469, - "loss": 0.55, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.3460471034049988, - "rewards/margins": 0.47235360741615295, - "rewards/rejected": -0.12630648910999298, + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.9436380863189697, + "logits/rejected": -2.9717044830322266, + "logps/chosen": -210.0995635986328, + "logps/rejected": -235.1615447998047, + "loss": 0.5107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4169088900089264, + "rewards/margins": 0.5272529125213623, + "rewards/rejected": -0.1103440523147583, "step": 120 }, { - "epoch": 0.13, - "learning_rate": 2.2336769759450173e-07, - "logits/chosen": -2.9342408180236816, - "logits/rejected": -2.939258098602295, - "logps/chosen": -329.4230651855469, - "logps/rejected": -291.20989990234375, - "loss": 0.5407, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.4995655417442322, - "rewards/margins": 0.7382161617279053, - "rewards/rejected": -0.23865056037902832, + "epoch": 0.07, + "learning_rate": 1.1168384879725086e-07, + "logits/chosen": -3.001401901245117, + "logits/rejected": -3.032275676727295, + "logps/chosen": -285.7945556640625, + "logps/rejected": -291.1168518066406, + "loss": 0.5105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.639220118522644, + "rewards/margins": 0.6806271076202393, + "rewards/rejected": -0.04140689969062805, "step": 130 }, { - "epoch": 0.14, - "learning_rate": 2.405498281786942e-07, - "logits/chosen": -2.9660542011260986, - "logits/rejected": -2.9834768772125244, - "logps/chosen": -349.0820007324219, - "logps/rejected": -262.83416748046875, - "loss": 0.5112, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4791792035102844, - "rewards/margins": 0.8585275411605835, - "rewards/rejected": -0.37934836745262146, + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.940995931625366, + "logits/rejected": -2.9406614303588867, + "logps/chosen": -285.1832275390625, + "logps/rejected": -329.8865661621094, + "loss": 0.507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5072210431098938, + "rewards/margins": 0.8132984042167664, + "rewards/rejected": -0.3060774505138397, "step": 140 }, { - "epoch": 0.15, - "learning_rate": 2.5773195876288655e-07, - "logits/chosen": -2.9196953773498535, - "logits/rejected": -2.9103140830993652, - "logps/chosen": -347.46453857421875, - "logps/rejected": -260.7893981933594, - "loss": 0.539, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.5144405364990234, - "rewards/margins": 0.822402834892273, - "rewards/rejected": -0.3079623579978943, + "epoch": 0.08, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -2.9889254570007324, + "logits/rejected": -2.98456072807312, + "logps/chosen": -220.37387084960938, + "logps/rejected": -282.42962646484375, + "loss": 0.4732, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5785535573959351, + "rewards/margins": 0.9343128204345703, + "rewards/rejected": -0.3557590842247009, "step": 150 }, { - "epoch": 0.17, - "learning_rate": 2.7491408934707903e-07, - "logits/chosen": -2.9616494178771973, - "logits/rejected": -2.9489588737487793, - "logps/chosen": -328.7740478515625, - "logps/rejected": -265.5461730957031, - "loss": 0.5049, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.6073015928268433, - "rewards/margins": 0.9107543230056763, - "rewards/rejected": -0.3034527003765106, + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.8972601890563965, + "logits/rejected": -2.9279913902282715, + "logps/chosen": -272.812255859375, + "logps/rejected": -280.38641357421875, + "loss": 0.4883, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5051306486129761, + "rewards/margins": 0.8609280586242676, + "rewards/rejected": -0.35579735040664673, "step": 160 }, { - "epoch": 0.18, - "learning_rate": 2.9209621993127146e-07, - "logits/chosen": -2.9152207374572754, - "logits/rejected": -2.923574924468994, - "logps/chosen": -343.9161071777344, - "logps/rejected": -280.7509765625, - "loss": 0.5156, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.4770805239677429, - "rewards/margins": 1.0360281467437744, - "rewards/rejected": -0.5589475631713867, + "epoch": 0.09, + "learning_rate": 1.4604810996563573e-07, + "logits/chosen": -2.9837818145751953, + "logits/rejected": -2.9798386096954346, + "logps/chosen": -236.0260009765625, + "logps/rejected": -272.0481262207031, + "loss": 0.426, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6715840101242065, + "rewards/margins": 1.2586230039596558, + "rewards/rejected": -0.5870389938354492, "step": 170 }, { - "epoch": 0.19, - "learning_rate": 3.0927835051546394e-07, - "logits/chosen": -2.907759428024292, - "logits/rejected": -2.9071803092956543, - "logps/chosen": -330.72161865234375, - "logps/rejected": -265.810302734375, - "loss": 0.4979, - "rewards/accuracies": 0.78125, - "rewards/chosen": 0.33995530009269714, - "rewards/margins": 0.9378688931465149, - "rewards/rejected": -0.5979136228561401, + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.9777297973632812, + "logits/rejected": -2.9535796642303467, + "logps/chosen": -179.25314331054688, + "logps/rejected": -235.5211944580078, + "loss": 0.4372, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6480907797813416, + "rewards/margins": 0.9972507357597351, + "rewards/rejected": -0.3491598665714264, "step": 180 }, { - "epoch": 0.2, - "learning_rate": 3.2646048109965636e-07, - "logits/chosen": -2.939296245574951, - "logits/rejected": -2.9180989265441895, - "logps/chosen": -300.32318115234375, - "logps/rejected": -250.5105743408203, - "loss": 0.5614, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.38950082659721375, - "rewards/margins": 0.9293573498725891, - "rewards/rejected": -0.539856493473053, + "epoch": 0.1, + "learning_rate": 1.6323024054982818e-07, + "logits/chosen": -2.9201600551605225, + "logits/rejected": -2.931079626083374, + "logps/chosen": -259.4476013183594, + "logps/rejected": -317.50244140625, + "loss": 0.4526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.7379086017608643, + "rewards/margins": 1.142943263053894, + "rewards/rejected": -0.4050346910953522, "step": 190 }, { - "epoch": 0.21, - "learning_rate": 3.436426116838488e-07, - "logits/chosen": -2.9912192821502686, - "logits/rejected": -2.9657769203186035, - "logps/chosen": -335.5823669433594, - "logps/rejected": -279.8517761230469, - "loss": 0.5322, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.3443685472011566, - "rewards/margins": 0.8442217111587524, - "rewards/rejected": -0.49985313415527344, + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.8957831859588623, + "logits/rejected": -2.929515838623047, + "logps/chosen": -302.5355224609375, + "logps/rejected": -221.0574951171875, + "loss": 0.4215, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5478682518005371, + "rewards/margins": 1.2445906400680542, + "rewards/rejected": -0.6967223882675171, "step": 200 }, { - "epoch": 0.21, - "eval_logits/chosen": -2.9475507736206055, - "eval_logits/rejected": -2.9307024478912354, - "eval_logps/chosen": -348.5544128417969, - "eval_logps/rejected": -291.5714111328125, - "eval_loss": 0.5251129269599915, - "eval_rewards/accuracies": 0.7680000066757202, - "eval_rewards/chosen": 0.4311439096927643, - "eval_rewards/margins": 1.1763547658920288, - "eval_rewards/rejected": -0.7452106475830078, - "eval_runtime": 500.63, - "eval_samples_per_second": 3.995, - "eval_steps_per_second": 0.499, + "epoch": 0.1, + "eval_logits/chosen": -2.8915369510650635, + "eval_logits/rejected": -2.9388110637664795, + "eval_logps/chosen": -242.70468139648438, + "eval_logps/rejected": -283.1268310546875, + "eval_loss": 0.46274256706237793, + "eval_rewards/accuracies": 0.7839999794960022, + "eval_rewards/chosen": 0.4975453019142151, + "eval_rewards/margins": 1.1964877843856812, + "eval_rewards/rejected": -0.6989425420761108, + "eval_runtime": 278.9272, + "eval_samples_per_second": 7.17, + "eval_steps_per_second": 0.448, "step": 200 }, { - "epoch": 0.22, - "learning_rate": 3.608247422680412e-07, - "logits/chosen": -2.9167776107788086, - "logits/rejected": -2.909834384918213, - "logps/chosen": -347.0169372558594, - "logps/rejected": -276.9785461425781, - "loss": 0.4708, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.6056002974510193, - "rewards/margins": 1.2408610582351685, - "rewards/rejected": -0.6352607011795044, + "epoch": 0.11, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -2.8694968223571777, + "logits/rejected": -2.865325689315796, + "logps/chosen": -251.77734375, + "logps/rejected": -315.6266174316406, + "loss": 0.4789, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.41053009033203125, + "rewards/margins": 1.1092506647109985, + "rewards/rejected": -0.6987205743789673, "step": 210 }, { - "epoch": 0.23, - "learning_rate": 3.7800687285223364e-07, - "logits/chosen": -2.945709705352783, - "logits/rejected": -2.9418673515319824, - "logps/chosen": -314.41168212890625, - "logps/rejected": -284.7486877441406, - "loss": 0.5639, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.4781018793582916, - "rewards/margins": 1.122393012046814, - "rewards/rejected": -0.6442912220954895, + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.955479145050049, + "logits/rejected": -2.920375108718872, + "logps/chosen": -290.3009338378906, + "logps/rejected": -279.76873779296875, + "loss": 0.4596, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.48892465233802795, + "rewards/margins": 1.2118638753890991, + "rewards/rejected": -0.7229393720626831, "step": 220 }, { - "epoch": 0.24, - "learning_rate": 3.9518900343642607e-07, - "logits/chosen": -2.9685254096984863, - "logits/rejected": -2.9509358406066895, - "logps/chosen": -348.00921630859375, - "logps/rejected": -302.19171142578125, - "loss": 0.5851, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.428355872631073, - "rewards/margins": 1.09479820728302, - "rewards/rejected": -0.666442334651947, + "epoch": 0.12, + "learning_rate": 1.9759450171821303e-07, + "logits/chosen": -2.9636857509613037, + "logits/rejected": -2.9484169483184814, + "logps/chosen": -257.77325439453125, + "logps/rejected": -290.58416748046875, + "loss": 0.4495, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5090667009353638, + "rewards/margins": 1.1924711465835571, + "rewards/rejected": -0.6834043264389038, "step": 230 }, { - "epoch": 0.25, - "learning_rate": 4.123711340206185e-07, - "logits/chosen": -3.018501043319702, - "logits/rejected": -3.016463041305542, - "logps/chosen": -315.842529296875, - "logps/rejected": -263.87420654296875, - "loss": 0.4913, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.3246752619743347, - "rewards/margins": 1.0819722414016724, - "rewards/rejected": -0.7572969198226929, + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.9927587509155273, + "logits/rejected": -2.982868194580078, + "logps/chosen": -300.302978515625, + "logps/rejected": -264.7530212402344, + "loss": 0.4626, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.48255831003189087, + "rewards/margins": 1.3429360389709473, + "rewards/rejected": -0.8603779077529907, "step": 240 }, { - "epoch": 0.26, - "learning_rate": 4.2955326460481097e-07, - "logits/chosen": -3.021206855773926, - "logits/rejected": -3.0122694969177246, - "logps/chosen": -387.8692626953125, - "logps/rejected": -301.49224853515625, - "loss": 0.5462, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.2839759886264801, - "rewards/margins": 1.1730202436447144, - "rewards/rejected": -0.8890441656112671, + "epoch": 0.13, + "learning_rate": 2.1477663230240549e-07, + "logits/chosen": -3.0206644535064697, + "logits/rejected": -3.019110679626465, + "logps/chosen": -266.93206787109375, + "logps/rejected": -293.4591369628906, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3956943452358246, + "rewards/margins": 1.2525049448013306, + "rewards/rejected": -0.8568106889724731, "step": 250 }, { - "epoch": 0.27, - "learning_rate": 4.4673539518900345e-07, - "logits/chosen": -3.0731797218322754, - "logits/rejected": -3.0552587509155273, - "logps/chosen": -328.5102844238281, - "logps/rejected": -269.9062194824219, - "loss": 0.6421, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.0661497563123703, - "rewards/margins": 0.8637269735336304, - "rewards/rejected": -0.7975772023200989, + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.004024028778076, + "logits/rejected": -2.9846959114074707, + "logps/chosen": -276.8220520019531, + "logps/rejected": -279.55718994140625, + "loss": 0.423, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.7008559703826904, + "rewards/margins": 1.6333271265029907, + "rewards/rejected": -0.932470977306366, "step": 260 }, { - "epoch": 0.28, - "learning_rate": 4.639175257731959e-07, - "logits/chosen": -3.074122905731201, - "logits/rejected": -3.061890125274658, - "logps/chosen": -315.14642333984375, - "logps/rejected": -286.4360046386719, - "loss": 0.5662, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.12611123919487, - "rewards/margins": 0.9487603902816772, - "rewards/rejected": -0.8226491808891296, + "epoch": 0.14, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -2.964247941970825, + "logits/rejected": -3.0104923248291016, + "logps/chosen": -233.50796508789062, + "logps/rejected": -284.2912902832031, + "loss": 0.3909, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5938805341720581, + "rewards/margins": 1.5185127258300781, + "rewards/rejected": -0.92463219165802, "step": 270 }, { - "epoch": 0.29, - "learning_rate": 4.810996563573884e-07, - "logits/chosen": -2.9959123134613037, - "logits/rejected": -2.9872357845306396, - "logps/chosen": -372.4709167480469, - "logps/rejected": -292.8404541015625, - "loss": 0.6315, - "rewards/accuracies": 0.78125, - "rewards/chosen": 0.5914579629898071, - "rewards/margins": 1.3111344575881958, - "rewards/rejected": -0.7196764945983887, + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.899641275405884, + "logits/rejected": -2.943345546722412, + "logps/chosen": -300.43194580078125, + "logps/rejected": -302.929931640625, + "loss": 0.4353, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.38024798035621643, + "rewards/margins": 1.3194282054901123, + "rewards/rejected": -0.9391803741455078, "step": 280 }, { - "epoch": 0.3, - "learning_rate": 4.982817869415807e-07, - "logits/chosen": -3.0343637466430664, - "logits/rejected": -2.992107391357422, - "logps/chosen": -342.8162536621094, - "logps/rejected": -284.862548828125, - "loss": 0.569, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.5013134479522705, - "rewards/margins": 1.26241135597229, - "rewards/rejected": -0.7610978484153748, + "epoch": 0.15, + "learning_rate": 2.4914089347079036e-07, + "logits/chosen": -2.897779941558838, + "logits/rejected": -2.932300090789795, + "logps/chosen": -289.35992431640625, + "logps/rejected": -324.7225341796875, + "loss": 0.4472, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.767623782157898, + "rewards/margins": 1.861802101135254, + "rewards/rejected": -1.0941781997680664, "step": 290 }, { - "epoch": 0.31, - "learning_rate": 4.982778415614236e-07, - "logits/chosen": -2.9965505599975586, - "logits/rejected": -3.0124495029449463, - "logps/chosen": -305.66571044921875, - "logps/rejected": -315.95111083984375, - "loss": 0.602, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.5931826829910278, - "rewards/margins": 1.2151721715927124, - "rewards/rejected": -0.6219894886016846, + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.9786391258239746, + "logits/rejected": -2.98624849319458, + "logps/chosen": -258.1387634277344, + "logps/rejected": -287.59051513671875, + "loss": 0.4508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4574419856071472, + "rewards/margins": 1.4439876079559326, + "rewards/rejected": -0.9865456819534302, "step": 300 }, { - "epoch": 0.31, - "eval_logits/chosen": -2.99521541595459, - "eval_logits/rejected": -2.96345853805542, - "eval_logps/chosen": -347.50177001953125, - "eval_logps/rejected": -290.2912902832031, - "eval_loss": 0.5439262986183167, - "eval_rewards/accuracies": 0.7440000176429749, - "eval_rewards/chosen": 0.5364080667495728, - "eval_rewards/margins": 1.1536086797714233, - "eval_rewards/rejected": -0.6172006130218506, - "eval_runtime": 500.8971, - "eval_samples_per_second": 3.993, - "eval_steps_per_second": 0.499, + "epoch": 0.15, + "eval_logits/chosen": -2.9006266593933105, + "eval_logits/rejected": -2.951172351837158, + "eval_logps/chosen": -243.17063903808594, + "eval_logps/rejected": -287.9976806640625, + "eval_loss": 0.47074779868125916, + "eval_rewards/accuracies": 0.7839999794960022, + "eval_rewards/chosen": 0.4509522318840027, + "eval_rewards/margins": 1.6369801759719849, + "eval_rewards/rejected": -1.1860281229019165, + "eval_runtime": 278.6174, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.449, "step": 300 }, { - "epoch": 0.32, - "learning_rate": 4.963643321852277e-07, - "logits/chosen": -3.0448598861694336, - "logits/rejected": -3.017517566680908, - "logps/chosen": -353.1573486328125, - "logps/rejected": -284.19000244140625, - "loss": 0.5839, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.37520498037338257, - "rewards/margins": 0.9747766256332397, - "rewards/rejected": -0.5995717644691467, + "epoch": 0.16, + "learning_rate": 2.663230240549828e-07, + "logits/chosen": -3.018040180206299, + "logits/rejected": -3.0253515243530273, + "logps/chosen": -266.28204345703125, + "logps/rejected": -265.2425842285156, + "loss": 0.437, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.318104088306427, + "rewards/margins": 1.3584096431732178, + "rewards/rejected": -1.040305733680725, "step": 310 }, { - "epoch": 0.33, - "learning_rate": 4.944508228090318e-07, - "logits/chosen": -3.0423622131347656, - "logits/rejected": -2.9570577144622803, - "logps/chosen": -302.97955322265625, - "logps/rejected": -245.19552612304688, - "loss": 0.5714, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5704945921897888, - "rewards/margins": 0.9897972345352173, - "rewards/rejected": -0.41930264234542847, + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.9877212047576904, + "logits/rejected": -2.993577241897583, + "logps/chosen": -281.7521057128906, + "logps/rejected": -267.09228515625, + "loss": 0.4209, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.4347918927669525, + "rewards/margins": 1.756643295288086, + "rewards/rejected": -1.3218514919281006, "step": 320 }, { - "epoch": 0.34, - "learning_rate": 4.925373134328357e-07, - "logits/chosen": -3.0704784393310547, - "logits/rejected": -3.0522284507751465, - "logps/chosen": -336.61260986328125, - "logps/rejected": -285.2626037597656, - "loss": 0.5327, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.5059818029403687, - "rewards/margins": 1.1841099262237549, - "rewards/rejected": -0.6781281232833862, + "epoch": 0.17, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -2.974451780319214, + "logits/rejected": -2.9852652549743652, + "logps/chosen": -280.6615295410156, + "logps/rejected": -287.22967529296875, + "loss": 0.346, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6992789506912231, + "rewards/margins": 2.1831793785095215, + "rewards/rejected": -1.483900547027588, "step": 330 }, { - "epoch": 0.35, - "learning_rate": 4.906238040566398e-07, - "logits/chosen": -2.9792017936706543, - "logits/rejected": -3.0039639472961426, - "logps/chosen": -294.2068786621094, - "logps/rejected": -279.0820617675781, - "loss": 0.6637, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.2278396338224411, - "rewards/margins": 0.7685919404029846, - "rewards/rejected": -0.5407522916793823, + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.8814268112182617, + "logits/rejected": -2.9174575805664062, + "logps/chosen": -279.43878173828125, + "logps/rejected": -296.05908203125, + "loss": 0.435, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5694992542266846, + "rewards/margins": 1.738724708557129, + "rewards/rejected": -1.1692254543304443, "step": 340 }, { - "epoch": 0.36, - "learning_rate": 4.887102946804438e-07, - "logits/chosen": -3.072432279586792, - "logits/rejected": -3.047440528869629, - "logps/chosen": -356.6179504394531, - "logps/rejected": -278.54400634765625, - "loss": 0.537, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.7054955959320068, - "rewards/margins": 1.3675787448883057, - "rewards/rejected": -0.6620832085609436, + "epoch": 0.18, + "learning_rate": 3.006872852233677e-07, + "logits/chosen": -2.994208335876465, + "logits/rejected": -2.974083662033081, + "logps/chosen": -204.75537109375, + "logps/rejected": -272.02471923828125, + "loss": 0.3807, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.33386245369911194, + "rewards/margins": 1.8207858800888062, + "rewards/rejected": -1.4869236946105957, "step": 350 }, { - "epoch": 0.37, - "learning_rate": 4.867967853042479e-07, - "logits/chosen": -3.057285785675049, - "logits/rejected": -3.0351486206054688, - "logps/chosen": -301.95526123046875, - "logps/rejected": -284.12005615234375, - "loss": 0.5121, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.5215950012207031, - "rewards/margins": 1.3286265134811401, - "rewards/rejected": -0.807031512260437, + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.9840168952941895, + "logits/rejected": -3.0060067176818848, + "logps/chosen": -230.21176147460938, + "logps/rejected": -264.5437316894531, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22637569904327393, + "rewards/margins": 1.9646222591400146, + "rewards/rejected": -1.7382465600967407, "step": 360 }, { - "epoch": 0.38, - "learning_rate": 4.84883275928052e-07, - "logits/chosen": -3.037200450897217, - "logits/rejected": -3.044207811355591, - "logps/chosen": -333.80474853515625, - "logps/rejected": -304.6210021972656, - "loss": 0.5898, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.2941075265407562, - "rewards/margins": 1.1119602918624878, - "rewards/rejected": -0.8178526759147644, + "epoch": 0.19, + "learning_rate": 3.178694158075601e-07, + "logits/chosen": -2.993978500366211, + "logits/rejected": -2.9822306632995605, + "logps/chosen": -232.2978057861328, + "logps/rejected": -236.8600616455078, + "loss": 0.4206, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.44667625427246094, + "rewards/margins": 2.0350661277770996, + "rewards/rejected": -1.5883899927139282, "step": 370 }, { - "epoch": 0.39, - "learning_rate": 4.82969766551856e-07, - "logits/chosen": -3.054816484451294, - "logits/rejected": -3.0310795307159424, - "logps/chosen": -367.0569152832031, - "logps/rejected": -301.362060546875, - "loss": 0.5764, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.4226422905921936, - "rewards/margins": 1.428713321685791, - "rewards/rejected": -1.0060709714889526, + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.9275434017181396, + "logits/rejected": -2.952369213104248, + "logps/chosen": -182.41514587402344, + "logps/rejected": -267.49188232421875, + "loss": 0.5245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6680549383163452, + "rewards/margins": 2.175363302230835, + "rewards/rejected": -1.5073082447052002, "step": 380 }, { - "epoch": 0.4, - "learning_rate": 4.810562571756601e-07, - "logits/chosen": -2.941606044769287, - "logits/rejected": -2.9353060722351074, - "logps/chosen": -364.6161193847656, - "logps/rejected": -285.13714599609375, - "loss": 0.5958, - "rewards/accuracies": 0.78125, - "rewards/chosen": 0.369827002286911, - "rewards/margins": 1.4904773235321045, - "rewards/rejected": -1.1206501722335815, + "epoch": 0.2, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -3.0049033164978027, + "logits/rejected": -3.054906129837036, + "logps/chosen": -211.040771484375, + "logps/rejected": -257.2097473144531, + "loss": 0.4087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5466452836990356, + "rewards/margins": 2.268761157989502, + "rewards/rejected": -1.7221157550811768, "step": 390 }, { - "epoch": 0.41, - "learning_rate": 4.791427477994642e-07, - "logits/chosen": -2.9758362770080566, - "logits/rejected": -2.9436521530151367, - "logps/chosen": -341.735107421875, - "logps/rejected": -297.5789489746094, - "loss": 0.5809, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.364463746547699, - "rewards/margins": 1.448970913887024, - "rewards/rejected": -1.0845072269439697, + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.062668561935425, + "logits/rejected": -3.073979616165161, + "logps/chosen": -220.09646606445312, + "logps/rejected": -228.078125, + "loss": 0.5348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.29860109090805054, + "rewards/margins": 1.6728731393814087, + "rewards/rejected": -1.374272108078003, "step": 400 }, { - "epoch": 0.41, - "eval_logits/chosen": -2.9359750747680664, - "eval_logits/rejected": -2.9205198287963867, - "eval_logps/chosen": -349.02520751953125, - "eval_logps/rejected": -293.040771484375, - "eval_loss": 0.5436112284660339, - "eval_rewards/accuracies": 0.7599999904632568, - "eval_rewards/chosen": 0.38406580686569214, - "eval_rewards/margins": 1.2762165069580078, - "eval_rewards/rejected": -0.89215087890625, - "eval_runtime": 499.6663, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 0.21, + "eval_logits/chosen": -2.956052780151367, + "eval_logits/rejected": -3.00534725189209, + "eval_logps/chosen": -244.32916259765625, + "eval_logps/rejected": -293.5365295410156, + "eval_loss": 0.47088953852653503, + "eval_rewards/accuracies": 0.8040000200271606, + "eval_rewards/chosen": 0.3350996673107147, + "eval_rewards/margins": 2.075010299682617, + "eval_rewards/rejected": -1.7399104833602905, + "eval_runtime": 278.6522, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.449, "step": 400 }, { - "epoch": 0.42, - "learning_rate": 4.772292384232682e-07, - "logits/chosen": -2.952409029006958, - "logits/rejected": -2.9130635261535645, - "logps/chosen": -338.5810546875, - "logps/rejected": -306.2530517578125, - "loss": 0.746, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.08780597150325775, - "rewards/margins": 0.7161486148834229, - "rewards/rejected": -0.8039544820785522, + "epoch": 0.21, + "learning_rate": 3.5223367697594503e-07, + "logits/chosen": -2.9030888080596924, + "logits/rejected": -2.903963804244995, + "logps/chosen": -218.80093383789062, + "logps/rejected": -270.7477111816406, + "loss": 0.3946, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.5591822862625122, + "rewards/margins": 2.3011364936828613, + "rewards/rejected": -1.7419544458389282, "step": 410 }, { - "epoch": 0.43, - "learning_rate": 4.753157290470723e-07, - "logits/chosen": -2.995166301727295, - "logits/rejected": -2.9763875007629395, - "logps/chosen": -342.8363342285156, - "logps/rejected": -276.787353515625, - "loss": 0.561, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.18220452964305878, - "rewards/margins": 0.9985648989677429, - "rewards/rejected": -0.816360354423523, + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.9065468311309814, + "logits/rejected": -2.909379482269287, + "logps/chosen": -281.303466796875, + "logps/rejected": -283.3905029296875, + "loss": 0.4069, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.30885323882102966, + "rewards/margins": 2.3154749870300293, + "rewards/rejected": -2.006621837615967, "step": 420 }, { - "epoch": 0.44, - "learning_rate": 4.7340221967087635e-07, - "logits/chosen": -3.0134756565093994, - "logits/rejected": -3.0398764610290527, - "logps/chosen": -321.25738525390625, - "logps/rejected": -267.5379943847656, - "loss": 0.5616, + "epoch": 0.22, + "learning_rate": 3.6941580756013745e-07, + "logits/chosen": -3.0179543495178223, + "logits/rejected": -2.9964187145233154, + "logps/chosen": -246.273193359375, + "logps/rejected": -244.64932250976562, + "loss": 0.4263, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.4658970236778259, - "rewards/margins": 1.3995509147644043, - "rewards/rejected": -0.9336539506912231, + "rewards/chosen": -0.045947521924972534, + "rewards/margins": 1.8498051166534424, + "rewards/rejected": -1.8957529067993164, "step": 430 }, { - "epoch": 0.45, - "learning_rate": 4.714887102946804e-07, - "logits/chosen": -3.0466179847717285, - "logits/rejected": -3.0337159633636475, - "logps/chosen": -326.5643005371094, - "logps/rejected": -267.01751708984375, - "loss": 0.5292, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.3320990800857544, - "rewards/margins": 1.2292256355285645, - "rewards/rejected": -0.8971264958381653, + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.9630212783813477, + "logits/rejected": -3.0045969486236572, + "logps/chosen": -257.0661926269531, + "logps/rejected": -351.8866271972656, + "loss": 0.4483, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.05867958813905716, + "rewards/margins": 1.8917491436004639, + "rewards/rejected": -1.8330695629119873, "step": 440 }, { - "epoch": 0.46, - "learning_rate": 4.6957520091848447e-07, - "logits/chosen": -3.0742642879486084, - "logits/rejected": -3.0417568683624268, - "logps/chosen": -298.84161376953125, - "logps/rejected": -281.374267578125, - "loss": 0.5816, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.23377075791358948, - "rewards/margins": 0.9984762072563171, - "rewards/rejected": -0.76470547914505, + "epoch": 0.23, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -3.0662550926208496, + "logits/rejected": -3.0640132427215576, + "logps/chosen": -228.62112426757812, + "logps/rejected": -314.25103759765625, + "loss": 0.4252, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3938661217689514, + "rewards/margins": 1.8685601949691772, + "rewards/rejected": -1.474694013595581, "step": 450 }, { - "epoch": 0.48, - "learning_rate": 4.6766169154228853e-07, - "logits/chosen": -3.01784348487854, - "logits/rejected": -3.042041540145874, - "logps/chosen": -331.4616394042969, - "logps/rejected": -285.1472473144531, - "loss": 0.557, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.6183053851127625, - "rewards/margins": 1.2917571067810059, - "rewards/rejected": -0.6734516620635986, + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -3.0231873989105225, + "logits/rejected": -3.001800775527954, + "logps/chosen": -251.1888885498047, + "logps/rejected": -312.30120849609375, + "loss": 0.5593, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4971710741519928, + "rewards/margins": 2.303713321685791, + "rewards/rejected": -1.8065423965454102, "step": 460 }, { - "epoch": 0.49, - "learning_rate": 4.657481821660926e-07, - "logits/chosen": -3.0028042793273926, - "logits/rejected": -2.9753506183624268, - "logps/chosen": -312.3541564941406, - "logps/rejected": -250.565673828125, - "loss": 0.5482, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.26564690470695496, - "rewards/margins": 1.1742582321166992, - "rewards/rejected": -0.9086114168167114, + "epoch": 0.24, + "learning_rate": 4.037800687285223e-07, + "logits/chosen": -3.061251401901245, + "logits/rejected": -3.065091848373413, + "logps/chosen": -299.6525573730469, + "logps/rejected": -242.2359619140625, + "loss": 0.4287, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.43876272439956665, + "rewards/margins": 2.200730085372925, + "rewards/rejected": -1.761967420578003, "step": 470 }, { - "epoch": 0.5, - "learning_rate": 4.6383467278989666e-07, - "logits/chosen": -3.0283827781677246, - "logits/rejected": -3.046517848968506, - "logps/chosen": -315.6674499511719, - "logps/rejected": -280.6795349121094, - "loss": 0.5289, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.547839343547821, - "rewards/margins": 1.7104412317276, - "rewards/rejected": -1.1626019477844238, + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.0854439735412598, + "logits/rejected": -3.067539691925049, + "logps/chosen": -296.04217529296875, + "logps/rejected": -291.0183410644531, + "loss": 0.388, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09810693562030792, + "rewards/margins": 1.5580971240997314, + "rewards/rejected": -1.4599902629852295, "step": 480 }, { - "epoch": 0.51, - "learning_rate": 4.6192116341370067e-07, - "logits/chosen": -2.985366106033325, - "logits/rejected": -2.969849109649658, - "logps/chosen": -330.6661682128906, - "logps/rejected": -275.83099365234375, - "loss": 0.5242, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": 0.4285426139831543, - "rewards/margins": 1.5095882415771484, - "rewards/rejected": -1.0810457468032837, + "epoch": 0.25, + "learning_rate": 4.209621993127148e-07, + "logits/chosen": -2.9323372840881348, + "logits/rejected": -2.976844310760498, + "logps/chosen": -282.63629150390625, + "logps/rejected": -290.7026062011719, + "loss": 0.4349, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22253699600696564, + "rewards/margins": 1.85140061378479, + "rewards/rejected": -1.6288635730743408, "step": 490 }, { - "epoch": 0.52, - "learning_rate": 4.6000765403750473e-07, - "logits/chosen": -3.0527420043945312, - "logits/rejected": -3.0406336784362793, - "logps/chosen": -299.01055908203125, - "logps/rejected": -242.08358764648438, - "loss": 0.5164, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.38268548250198364, - "rewards/margins": 1.2666680812835693, - "rewards/rejected": -0.8839825391769409, + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.1376662254333496, + "logits/rejected": -3.1647543907165527, + "logps/chosen": -230.1941375732422, + "logps/rejected": -306.46929931640625, + "loss": 0.4742, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3802974820137024, + "rewards/margins": 2.1001405715942383, + "rewards/rejected": -1.7198429107666016, "step": 500 }, { - "epoch": 0.52, - "eval_logits/chosen": -3.015965223312378, - "eval_logits/rejected": -3.0117154121398926, - "eval_logps/chosen": -349.8603210449219, - "eval_logps/rejected": -295.5238952636719, - "eval_loss": 0.5405718088150024, - "eval_rewards/accuracies": 0.765999972820282, - "eval_rewards/chosen": 0.3005577325820923, - "eval_rewards/margins": 1.4410209655761719, - "eval_rewards/rejected": -1.1404632329940796, - "eval_runtime": 499.3309, - "eval_samples_per_second": 4.005, - "eval_steps_per_second": 0.501, + "epoch": 0.26, + "eval_logits/chosen": -3.0499820709228516, + "eval_logits/rejected": -3.1011428833007812, + "eval_logps/chosen": -243.72792053222656, + "eval_logps/rejected": -294.0814208984375, + "eval_loss": 0.5065268874168396, + "eval_rewards/accuracies": 0.8220000267028809, + "eval_rewards/chosen": 0.39522290229797363, + "eval_rewards/margins": 2.1896207332611084, + "eval_rewards/rejected": -1.7943980693817139, + "eval_runtime": 278.4037, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.449, "step": 500 }, { - "epoch": 0.53, - "learning_rate": 4.580941446613088e-07, - "logits/chosen": -3.002946376800537, - "logits/rejected": -2.97822642326355, - "logps/chosen": -320.47601318359375, - "logps/rejected": -279.7250671386719, - "loss": 0.5376, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.21459856629371643, - "rewards/margins": 1.4735734462738037, - "rewards/rejected": -1.2589749097824097, + "epoch": 0.26, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -3.00343918800354, + "logits/rejected": -3.095369815826416, + "logps/chosen": -258.6968994140625, + "logps/rejected": -292.36102294921875, + "loss": 0.573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.032131295651197433, + "rewards/margins": 1.5518535375595093, + "rewards/rejected": -1.5197222232818604, "step": 510 }, { - "epoch": 0.54, - "learning_rate": 4.5618063528511285e-07, - "logits/chosen": -3.031226634979248, - "logits/rejected": -3.0124049186706543, - "logps/chosen": -306.7613830566406, - "logps/rejected": -264.15411376953125, - "loss": 0.5679, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.32867926359176636, - "rewards/margins": 1.4985711574554443, - "rewards/rejected": -1.1698918342590332, + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -3.037055730819702, + "logits/rejected": -3.0623817443847656, + "logps/chosen": -223.7424774169922, + "logps/rejected": -250.5542449951172, + "loss": 0.5577, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20696468651294708, + "rewards/margins": 1.9794868230819702, + "rewards/rejected": -2.1864516735076904, "step": 520 }, { - "epoch": 0.55, - "learning_rate": 4.542671259089169e-07, - "logits/chosen": -2.965167284011841, - "logits/rejected": -2.938575267791748, - "logps/chosen": -300.1754455566406, - "logps/rejected": -275.5013427734375, - "loss": 0.5973, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.07723500579595566, - "rewards/margins": 0.9571349024772644, - "rewards/rejected": -1.034369945526123, + "epoch": 0.27, + "learning_rate": 4.5532646048109964e-07, + "logits/chosen": -3.029510021209717, + "logits/rejected": -3.048191547393799, + "logps/chosen": -239.50729370117188, + "logps/rejected": -269.36920166015625, + "loss": 0.4672, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13861794769763947, + "rewards/margins": 2.154052257537842, + "rewards/rejected": -2.0154342651367188, "step": 530 }, { - "epoch": 0.56, - "learning_rate": 4.52353616532721e-07, - "logits/chosen": -3.0364766120910645, - "logits/rejected": -3.031278610229492, - "logps/chosen": -347.5738220214844, - "logps/rejected": -286.0475769042969, - "loss": 0.559, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.6112526059150696, - "rewards/margins": 1.4097859859466553, - "rewards/rejected": -0.79853355884552, + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.0581881999969482, + "logits/rejected": -3.0828652381896973, + "logps/chosen": -284.1593933105469, + "logps/rejected": -302.8993225097656, + "loss": 0.5169, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.19171719253063202, + "rewards/margins": 1.9804853200912476, + "rewards/rejected": -1.7887680530548096, "step": 540 }, { - "epoch": 0.57, - "learning_rate": 4.5044010715652504e-07, - "logits/chosen": -2.958436965942383, - "logits/rejected": -2.957434892654419, - "logps/chosen": -339.9021301269531, - "logps/rejected": -298.78729248046875, - "loss": 0.535, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.24418941140174866, - "rewards/margins": 1.1881563663482666, - "rewards/rejected": -0.9439669847488403, + "epoch": 0.28, + "learning_rate": 4.7250859106529206e-07, + "logits/chosen": -3.065326452255249, + "logits/rejected": -3.0628104209899902, + "logps/chosen": -265.96173095703125, + "logps/rejected": -304.02923583984375, + "loss": 0.5727, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.19634675979614258, + "rewards/margins": 2.1012516021728516, + "rewards/rejected": -1.9049047231674194, "step": 550 }, { - "epoch": 0.58, - "learning_rate": 4.485265977803291e-07, - "logits/chosen": -2.9755353927612305, - "logits/rejected": -2.9734432697296143, - "logps/chosen": -338.18255615234375, - "logps/rejected": -294.8335876464844, - "loss": 0.5625, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.1414714902639389, - "rewards/margins": 1.3819502592086792, - "rewards/rejected": -1.240478754043579, + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.045553684234619, + "logits/rejected": -3.0148532390594482, + "logps/chosen": -308.58282470703125, + "logps/rejected": -330.0618896484375, + "loss": 0.4386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.14334824681282043, + "rewards/margins": 2.13740873336792, + "rewards/rejected": -1.994060754776001, "step": 560 }, { - "epoch": 0.59, - "learning_rate": 4.4661308840413316e-07, - "logits/chosen": -2.8903489112854004, - "logits/rejected": -2.886591672897339, - "logps/chosen": -337.5680236816406, - "logps/rejected": -308.0293884277344, - "loss": 0.5135, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.011459055356681347, - "rewards/margins": 1.3413360118865967, - "rewards/rejected": -1.3527950048446655, + "epoch": 0.29, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -3.091966152191162, + "logits/rejected": -3.092928886413574, + "logps/chosen": -207.5926513671875, + "logps/rejected": -312.8612365722656, + "loss": 0.4463, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2711809277534485, + "rewards/margins": 2.29288911819458, + "rewards/rejected": -2.0217084884643555, "step": 570 }, { - "epoch": 0.6, - "learning_rate": 4.446995790279372e-07, - "logits/chosen": -2.918245792388916, - "logits/rejected": -2.9179019927978516, - "logps/chosen": -331.99859619140625, - "logps/rejected": -315.8232421875, - "loss": 0.5671, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.17509713768959045, - "rewards/margins": 1.5391473770141602, - "rewards/rejected": -1.3640501499176025, - "step": 580 - }, + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.1156067848205566, + "logits/rejected": -3.081942081451416, + "logps/chosen": -261.8205871582031, + "logps/rejected": -246.6261444091797, + "loss": 0.5618, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.352669894695282, + "rewards/margins": 2.471661329269409, + "rewards/rejected": -2.1189913749694824, + "step": 580 + }, { - "epoch": 0.61, - "learning_rate": 4.4278606965174123e-07, - "logits/chosen": -2.947676420211792, - "logits/rejected": -2.9416377544403076, - "logps/chosen": -324.63861083984375, - "logps/rejected": -275.0785217285156, - "loss": 0.5666, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.015921253710985184, - "rewards/margins": 1.2579394578933716, - "rewards/rejected": -1.242018222808838, + "epoch": 0.3, + "learning_rate": 4.992350353796136e-07, + "logits/chosen": -3.011913776397705, + "logits/rejected": -3.0575170516967773, + "logps/chosen": -221.0374298095703, + "logps/rejected": -290.2525634765625, + "loss": 0.4376, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.34248480200767517, + "rewards/margins": 2.1471829414367676, + "rewards/rejected": -1.8046982288360596, "step": 590 }, { - "epoch": 0.62, - "learning_rate": 4.408725602755453e-07, - "logits/chosen": -2.9947569370269775, - "logits/rejected": -2.9961860179901123, - "logps/chosen": -284.56951904296875, - "logps/rejected": -242.5293426513672, - "loss": 0.5957, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.21141913533210754, - "rewards/margins": 0.8547714948654175, - "rewards/rejected": -1.066190481185913, + "epoch": 0.31, + "learning_rate": 4.982788296041308e-07, + "logits/chosen": -3.086638927459717, + "logits/rejected": -3.0424036979675293, + "logps/chosen": -224.4527587890625, + "logps/rejected": -277.10101318359375, + "loss": 0.6062, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10577313601970673, + "rewards/margins": 2.2690727710723877, + "rewards/rejected": -2.3748459815979004, "step": 600 }, { - "epoch": 0.62, - "eval_logits/chosen": -2.9458935260772705, - "eval_logits/rejected": -2.935307025909424, - "eval_logps/chosen": -350.91400146484375, - "eval_logps/rejected": -294.41485595703125, - "eval_loss": 0.5336324572563171, - "eval_rewards/accuracies": 0.7379999756813049, - "eval_rewards/chosen": 0.1951846182346344, - "eval_rewards/margins": 1.224743366241455, - "eval_rewards/rejected": -1.0295586585998535, - "eval_runtime": 500.1685, - "eval_samples_per_second": 3.999, - "eval_steps_per_second": 0.5, + "epoch": 0.31, + "eval_logits/chosen": -2.973641872406006, + "eval_logits/rejected": -3.0394279956817627, + "eval_logps/chosen": -243.62783813476562, + "eval_logps/rejected": -295.17205810546875, + "eval_loss": 0.45028701424598694, + "eval_rewards/accuracies": 0.7979999780654907, + "eval_rewards/chosen": 0.40523144602775574, + "eval_rewards/margins": 2.3086962699890137, + "eval_rewards/rejected": -1.903464913368225, + "eval_runtime": 278.8163, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 0.448, "step": 600 }, { - "epoch": 0.63, - "learning_rate": 4.3895905089934936e-07, - "logits/chosen": -2.939690113067627, - "logits/rejected": -2.9660871028900146, - "logps/chosen": -327.99853515625, - "logps/rejected": -304.2380065917969, - "loss": 0.5408, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.16417217254638672, - "rewards/margins": 1.4821269512176514, - "rewards/rejected": -1.3179547786712646, + "epoch": 0.31, + "learning_rate": 4.973226238286479e-07, + "logits/chosen": -3.030365228652954, + "logits/rejected": -3.0040786266326904, + "logps/chosen": -311.5085754394531, + "logps/rejected": -328.2323913574219, + "loss": 0.4114, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.8323566317558289, + "rewards/margins": 2.6754965782165527, + "rewards/rejected": -1.8431400060653687, "step": 610 }, { - "epoch": 0.64, - "learning_rate": 4.370455415231534e-07, - "logits/chosen": -2.9834229946136475, - "logits/rejected": -2.985764980316162, - "logps/chosen": -375.3091125488281, - "logps/rejected": -344.99591064453125, - "loss": 0.5471, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.2328425645828247, - "rewards/margins": 1.4427080154418945, - "rewards/rejected": -1.2098654508590698, + "epoch": 0.32, + "learning_rate": 4.96366418053165e-07, + "logits/chosen": -3.1455302238464355, + "logits/rejected": -3.1191396713256836, + "logps/chosen": -239.08175659179688, + "logps/rejected": -321.47772216796875, + "loss": 0.4984, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.34067609906196594, + "rewards/margins": 1.8430697917938232, + "rewards/rejected": -1.5023938417434692, "step": 620 }, { - "epoch": 0.65, - "learning_rate": 4.351320321469575e-07, - "logits/chosen": -2.9586803913116455, - "logits/rejected": -2.9774413108825684, - "logps/chosen": -365.9512023925781, - "logps/rejected": -294.7960205078125, - "loss": 0.6138, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.1919662207365036, - "rewards/margins": 1.4944833517074585, - "rewards/rejected": -1.3025171756744385, + "epoch": 0.33, + "learning_rate": 4.954102122776821e-07, + "logits/chosen": -3.211892604827881, + "logits/rejected": -3.096082925796509, + "logps/chosen": -221.83352661132812, + "logps/rejected": -225.68270874023438, + "loss": 0.3615, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5400357842445374, + "rewards/margins": 2.6193814277648926, + "rewards/rejected": -2.079345464706421, "step": 630 }, { - "epoch": 0.66, - "learning_rate": 4.3321852277076154e-07, - "logits/chosen": -3.0369629859924316, - "logits/rejected": -3.0328054428100586, - "logps/chosen": -342.85235595703125, - "logps/rejected": -305.83343505859375, - "loss": 0.5368, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.0010919570922851562, - "rewards/margins": 1.265241265296936, - "rewards/rejected": -1.2663332223892212, + "epoch": 0.33, + "learning_rate": 4.944540065021993e-07, + "logits/chosen": -2.9544904232025146, + "logits/rejected": -3.0490562915802, + "logps/chosen": -211.2279510498047, + "logps/rejected": -266.6455078125, + "loss": 0.5134, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1349196881055832, + "rewards/margins": 2.3002161979675293, + "rewards/rejected": -2.1652963161468506, "step": 640 }, { - "epoch": 0.67, - "learning_rate": 4.313050133945656e-07, - "logits/chosen": -3.0272622108459473, - "logits/rejected": -3.0403847694396973, - "logps/chosen": -333.6284484863281, - "logps/rejected": -284.5811462402344, - "loss": 0.5111, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.32994264364242554, - "rewards/margins": 1.4819453954696655, - "rewards/rejected": -1.1520028114318848, + "epoch": 0.34, + "learning_rate": 4.934978007267163e-07, + "logits/chosen": -3.1061511039733887, + "logits/rejected": -3.083832263946533, + "logps/chosen": -229.7516632080078, + "logps/rejected": -296.2362060546875, + "loss": 0.4771, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.3486045002937317, + "rewards/margins": 2.3736672401428223, + "rewards/rejected": -2.0250627994537354, "step": 650 }, { - "epoch": 0.68, - "learning_rate": 4.2939150401836967e-07, - "logits/chosen": -3.0416343212127686, - "logits/rejected": -3.047837734222412, - "logps/chosen": -329.989990234375, - "logps/rejected": -305.66046142578125, - "loss": 0.5253, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.07132011651992798, - "rewards/margins": 1.4362289905548096, - "rewards/rejected": -1.3649089336395264, + "epoch": 0.34, + "learning_rate": 4.925415949512335e-07, + "logits/chosen": -3.0613837242126465, + "logits/rejected": -3.051008701324463, + "logps/chosen": -301.8056640625, + "logps/rejected": -314.62066650390625, + "loss": 0.4375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7906830310821533, + "rewards/margins": 3.2094345092773438, + "rewards/rejected": -2.4187512397766113, "step": 660 }, { - "epoch": 0.69, - "learning_rate": 4.2747799464217373e-07, - "logits/chosen": -2.998131036758423, - "logits/rejected": -2.992091417312622, - "logps/chosen": -378.5470275878906, - "logps/rejected": -282.40570068359375, - "loss": 0.5477, + "epoch": 0.35, + "learning_rate": 4.915853891757506e-07, + "logits/chosen": -3.042600154876709, + "logits/rejected": -3.056886672973633, + "logps/chosen": -180.24853515625, + "logps/rejected": -310.36590576171875, + "loss": 0.5875, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.08499707281589508, - "rewards/margins": 1.1583983898162842, - "rewards/rejected": -1.0734012126922607, + "rewards/chosen": -0.13172228634357452, + "rewards/margins": 2.5716919898986816, + "rewards/rejected": -2.70341420173645, "step": 670 }, { - "epoch": 0.7, - "learning_rate": 4.255644852659778e-07, - "logits/chosen": -2.9488022327423096, - "logits/rejected": -2.993544816970825, - "logps/chosen": -360.3157653808594, - "logps/rejected": -310.53826904296875, - "loss": 0.6413, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.2993057370185852, - "rewards/margins": 1.025062918663025, - "rewards/rejected": -0.7257571220397949, + "epoch": 0.35, + "learning_rate": 4.906291834002677e-07, + "logits/chosen": -3.0984647274017334, + "logits/rejected": -3.095193386077881, + "logps/chosen": -238.5060577392578, + "logps/rejected": -300.75897216796875, + "loss": 0.4938, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.11066919565200806, + "rewards/margins": 2.522183656692505, + "rewards/rejected": -2.4115145206451416, "step": 680 }, { - "epoch": 0.71, - "learning_rate": 4.236509758897818e-07, - "logits/chosen": -2.9860472679138184, - "logits/rejected": -3.0196166038513184, - "logps/chosen": -314.6225280761719, - "logps/rejected": -263.229248046875, - "loss": 0.5575, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.18419621884822845, - "rewards/margins": 1.147241473197937, - "rewards/rejected": -0.9630452990531921, + "epoch": 0.36, + "learning_rate": 4.896729776247848e-07, + "logits/chosen": -3.10922908782959, + "logits/rejected": -3.1270406246185303, + "logps/chosen": -289.7776184082031, + "logps/rejected": -288.74310302734375, + "loss": 0.5012, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.14315786957740784, + "rewards/margins": 2.4653332233428955, + "rewards/rejected": -2.3221755027770996, "step": 690 }, { - "epoch": 0.72, - "learning_rate": 4.2173746651358586e-07, - "logits/chosen": -2.97763729095459, - "logits/rejected": -2.972525119781494, - "logps/chosen": -351.1864013671875, - "logps/rejected": -286.4507751464844, - "loss": 0.6516, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 1.0068714800581802e-05, - "rewards/margins": 0.9209893345832825, - "rewards/rejected": -0.9209792017936707, + "epoch": 0.36, + "learning_rate": 4.88716771849302e-07, + "logits/chosen": -3.106825351715088, + "logits/rejected": -3.0666584968566895, + "logps/chosen": -318.9488525390625, + "logps/rejected": -314.478759765625, + "loss": 0.4228, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.45823708176612854, + "rewards/margins": 2.8160953521728516, + "rewards/rejected": -2.357858180999756, "step": 700 }, { - "epoch": 0.72, - "eval_logits/chosen": -2.985739231109619, - "eval_logits/rejected": -2.9889140129089355, - "eval_logps/chosen": -350.6415710449219, - "eval_logps/rejected": -294.66180419921875, - "eval_loss": 0.5292132496833801, - "eval_rewards/accuracies": 0.7519999742507935, - "eval_rewards/chosen": 0.22242723405361176, - "eval_rewards/margins": 1.2766809463500977, - "eval_rewards/rejected": -1.0542538166046143, - "eval_runtime": 499.164, - "eval_samples_per_second": 4.007, - "eval_steps_per_second": 0.501, + "epoch": 0.36, + "eval_logits/chosen": -2.9973301887512207, + "eval_logits/rejected": -3.065870523452759, + "eval_logps/chosen": -248.1629180908203, + "eval_logps/rejected": -302.49688720703125, + "eval_loss": 0.5025976896286011, + "eval_rewards/accuracies": 0.8199999928474426, + "eval_rewards/chosen": -0.04827720671892166, + "eval_rewards/margins": 2.5876684188842773, + "eval_rewards/rejected": -2.635946035385132, + "eval_runtime": 278.7128, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.448, "step": 700 }, { - "epoch": 0.73, - "learning_rate": 4.198239571373899e-07, - "logits/chosen": -2.9942946434020996, - "logits/rejected": -2.9884631633758545, - "logps/chosen": -320.36328125, - "logps/rejected": -278.8677062988281, - "loss": 0.6297, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.20690801739692688, - "rewards/margins": 1.0582275390625, - "rewards/rejected": -0.8513194918632507, + "epoch": 0.37, + "learning_rate": 4.87760566073819e-07, + "logits/chosen": -3.006333827972412, + "logits/rejected": -3.0047171115875244, + "logps/chosen": -304.8209533691406, + "logps/rejected": -291.0613708496094, + "loss": 0.4523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8323618173599243, + "rewards/margins": 2.86159610748291, + "rewards/rejected": -2.0292341709136963, "step": 710 }, { - "epoch": 0.74, - "learning_rate": 4.17910447761194e-07, - "logits/chosen": -2.9798600673675537, - "logits/rejected": -2.9808664321899414, - "logps/chosen": -346.0234680175781, - "logps/rejected": -326.8348083496094, - "loss": 0.6079, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.026054318994283676, - "rewards/margins": 1.1562727689743042, - "rewards/rejected": -1.1302186250686646, + "epoch": 0.37, + "learning_rate": 4.868043602983362e-07, + "logits/chosen": -3.0522098541259766, + "logits/rejected": -3.1009223461151123, + "logps/chosen": -246.0616455078125, + "logps/rejected": -343.6761169433594, + "loss": 0.4488, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4445156157016754, + "rewards/margins": 3.416804552078247, + "rewards/rejected": -2.9722886085510254, "step": 720 }, { - "epoch": 0.75, - "learning_rate": 4.1599693838499805e-07, - "logits/chosen": -2.9801442623138428, - "logits/rejected": -2.9823849201202393, - "logps/chosen": -331.5793151855469, - "logps/rejected": -271.0641174316406, - "loss": 0.569, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.46656933426856995, - "rewards/margins": 1.4055955410003662, - "rewards/rejected": -0.9390263557434082, + "epoch": 0.38, + "learning_rate": 4.858481545228533e-07, + "logits/chosen": -3.0612289905548096, + "logits/rejected": -3.070131301879883, + "logps/chosen": -283.8197021484375, + "logps/rejected": -327.98480224609375, + "loss": 0.4436, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.8823187947273254, + "rewards/margins": 3.157557249069214, + "rewards/rejected": -2.2752389907836914, "step": 730 }, { - "epoch": 0.76, - "learning_rate": 4.140834290088021e-07, - "logits/chosen": -2.8921711444854736, - "logits/rejected": -2.893559694290161, - "logps/chosen": -353.5256652832031, - "logps/rejected": -315.2562561035156, - "loss": 0.5244, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.3240983784198761, - "rewards/margins": 1.932403326034546, - "rewards/rejected": -1.6083049774169922, + "epoch": 0.38, + "learning_rate": 4.848919487473704e-07, + "logits/chosen": -2.9994044303894043, + "logits/rejected": -3.0417821407318115, + "logps/chosen": -270.36944580078125, + "logps/rejected": -349.9217224121094, + "loss": 0.6635, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09951668977737427, + "rewards/margins": 2.4409453868865967, + "rewards/rejected": -2.5404622554779053, "step": 740 }, { - "epoch": 0.77, - "learning_rate": 4.121699196326062e-07, - "logits/chosen": -2.909632444381714, - "logits/rejected": -2.9216995239257812, - "logps/chosen": -324.0218200683594, - "logps/rejected": -261.0860900878906, - "loss": 0.515, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.3732963800430298, - "rewards/margins": 1.5582427978515625, - "rewards/rejected": -1.1849465370178223, + "epoch": 0.39, + "learning_rate": 4.839357429718875e-07, + "logits/chosen": -3.0597434043884277, + "logits/rejected": -3.095123767852783, + "logps/chosen": -274.39984130859375, + "logps/rejected": -296.67333984375, + "loss": 0.5506, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.694625973701477, + "rewards/margins": 1.9568843841552734, + "rewards/rejected": -2.651510238647461, "step": 750 }, { - "epoch": 0.78, - "learning_rate": 4.1025641025641024e-07, - "logits/chosen": -2.954899311065674, - "logits/rejected": -2.949449062347412, - "logps/chosen": -320.23431396484375, - "logps/rejected": -323.3485412597656, - "loss": 0.5455, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.34602198004722595, - "rewards/margins": 1.3542587757110596, - "rewards/rejected": -1.0082366466522217, + "epoch": 0.39, + "learning_rate": 4.829795371964047e-07, + "logits/chosen": -3.0193636417388916, + "logits/rejected": -3.0218288898468018, + "logps/chosen": -250.9928436279297, + "logps/rejected": -305.6485900878906, + "loss": 0.4669, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4517936110496521, + "rewards/margins": 2.732727527618408, + "rewards/rejected": -2.2809338569641113, "step": 760 }, { - "epoch": 0.8, - "learning_rate": 4.083429008802143e-07, - "logits/chosen": -2.9397459030151367, - "logits/rejected": -2.958151340484619, - "logps/chosen": -340.77081298828125, - "logps/rejected": -300.03826904296875, - "loss": 0.5168, + "epoch": 0.4, + "learning_rate": 4.820233314209217e-07, + "logits/chosen": -2.859200954437256, + "logits/rejected": -2.8881382942199707, + "logps/chosen": -231.7881622314453, + "logps/rejected": -282.0718078613281, + "loss": 0.7131, "rewards/accuracies": 0.75, - "rewards/chosen": 0.03096119686961174, - "rewards/margins": 1.257548213005066, - "rewards/rejected": -1.226586937904358, + "rewards/chosen": 0.0762915164232254, + "rewards/margins": 2.6063120365142822, + "rewards/rejected": -2.530020236968994, "step": 770 }, { - "epoch": 0.81, - "learning_rate": 4.0642939150401836e-07, - "logits/chosen": -2.9671478271484375, - "logits/rejected": -2.9739999771118164, - "logps/chosen": -326.27630615234375, - "logps/rejected": -293.6321716308594, - "loss": 0.5292, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.1696806699037552, - "rewards/margins": 1.2981321811676025, - "rewards/rejected": -1.1284515857696533, + "epoch": 0.4, + "learning_rate": 4.810671256454389e-07, + "logits/chosen": -2.876128673553467, + "logits/rejected": -2.917257070541382, + "logps/chosen": -291.61676025390625, + "logps/rejected": -338.470703125, + "loss": 0.4415, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1621725857257843, + "rewards/margins": 2.5127065181732178, + "rewards/rejected": -2.350533962249756, "step": 780 }, { - "epoch": 0.82, - "learning_rate": 4.0451588212782237e-07, - "logits/chosen": -2.975926637649536, - "logits/rejected": -2.988398551940918, - "logps/chosen": -306.8700256347656, - "logps/rejected": -270.39520263671875, - "loss": 0.5052, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.08689726889133453, - "rewards/margins": 1.731827735900879, - "rewards/rejected": -1.6449304819107056, + "epoch": 0.41, + "learning_rate": 4.80110919869956e-07, + "logits/chosen": -2.9246349334716797, + "logits/rejected": -2.957263231277466, + "logps/chosen": -266.44879150390625, + "logps/rejected": -268.9879150390625, + "loss": 0.4671, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4171672761440277, + "rewards/margins": 2.7636542320251465, + "rewards/rejected": -2.346486806869507, "step": 790 }, { - "epoch": 0.83, - "learning_rate": 4.0260237275162643e-07, - "logits/chosen": -2.912811279296875, - "logits/rejected": -2.917335271835327, - "logps/chosen": -326.623291015625, - "logps/rejected": -297.1636047363281, - "loss": 0.5353, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.33763498067855835, - "rewards/margins": 1.476910948753357, - "rewards/rejected": -1.1392759084701538, + "epoch": 0.41, + "learning_rate": 4.791547140944731e-07, + "logits/chosen": -2.82590651512146, + "logits/rejected": -2.85862398147583, + "logps/chosen": -220.08151245117188, + "logps/rejected": -262.57232666015625, + "loss": 0.5396, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.6729877591133118, + "rewards/margins": 2.703979253768921, + "rewards/rejected": -2.030991792678833, "step": 800 }, { - "epoch": 0.83, - "eval_logits/chosen": -2.9171199798583984, - "eval_logits/rejected": -2.9115679264068604, - "eval_logps/chosen": -351.76287841796875, - "eval_logps/rejected": -298.2225341796875, - "eval_loss": 0.5144525766372681, - "eval_rewards/accuracies": 0.7559999823570251, - "eval_rewards/chosen": 0.11029549688100815, - "eval_rewards/margins": 1.5206220149993896, - "eval_rewards/rejected": -1.4103264808654785, - "eval_runtime": 500.3685, - "eval_samples_per_second": 3.997, - "eval_steps_per_second": 0.5, + "epoch": 0.41, + "eval_logits/chosen": -2.855961561203003, + "eval_logits/rejected": -2.910541296005249, + "eval_logps/chosen": -242.5603485107422, + "eval_logps/rejected": -296.4591979980469, + "eval_loss": 0.4615330398082733, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": 0.5119800567626953, + "eval_rewards/margins": 2.544154405593872, + "eval_rewards/rejected": -2.0321743488311768, + "eval_runtime": 279.1099, + "eval_samples_per_second": 7.166, + "eval_steps_per_second": 0.448, "step": 800 }, { - "epoch": 0.84, - "learning_rate": 4.006888633754305e-07, - "logits/chosen": -2.9584765434265137, - "logits/rejected": -2.9493420124053955, - "logps/chosen": -344.96832275390625, - "logps/rejected": -291.515380859375, - "loss": 0.4904, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.19502875208854675, - "rewards/margins": 1.6929543018341064, - "rewards/rejected": -1.4979256391525269, + "epoch": 0.42, + "learning_rate": 4.781985083189902e-07, + "logits/chosen": -2.9013190269470215, + "logits/rejected": -2.9036900997161865, + "logps/chosen": -205.20681762695312, + "logps/rejected": -307.65447998046875, + "loss": 0.5874, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.207598015666008, + "rewards/margins": 2.063453197479248, + "rewards/rejected": -1.8558553457260132, "step": 810 }, { - "epoch": 0.85, - "learning_rate": 3.9877535399923456e-07, - "logits/chosen": -2.965297222137451, - "logits/rejected": -2.9511642456054688, - "logps/chosen": -325.5323181152344, - "logps/rejected": -296.2393493652344, - "loss": 0.5431, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.10147881507873535, - "rewards/margins": 1.3891279697418213, - "rewards/rejected": -1.490606665611267, + "epoch": 0.42, + "learning_rate": 4.772423025435074e-07, + "logits/chosen": -2.8757100105285645, + "logits/rejected": -2.8260178565979004, + "logps/chosen": -298.41424560546875, + "logps/rejected": -329.42718505859375, + "loss": 1.0314, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4544452130794525, + "rewards/margins": 1.7502931356430054, + "rewards/rejected": -1.2958478927612305, "step": 820 }, { - "epoch": 0.86, - "learning_rate": 3.968618446230386e-07, - "logits/chosen": -2.9831089973449707, - "logits/rejected": -2.988027572631836, - "logps/chosen": -365.47564697265625, - "logps/rejected": -278.7037048339844, - "loss": 0.556, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": 0.29974445700645447, - "rewards/margins": 1.7932666540145874, - "rewards/rejected": -1.4935224056243896, + "epoch": 0.43, + "learning_rate": 4.762860967680244e-07, + "logits/chosen": -2.871217727661133, + "logits/rejected": -2.9765567779541016, + "logps/chosen": -175.9529571533203, + "logps/rejected": -234.138427734375, + "loss": 0.7217, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.6448418498039246, + "rewards/margins": 2.582141876220703, + "rewards/rejected": -1.9372999668121338, "step": 830 }, { - "epoch": 0.87, - "learning_rate": 3.949483352468427e-07, - "logits/chosen": -2.9965195655822754, - "logits/rejected": -3.0119361877441406, - "logps/chosen": -310.4830627441406, - "logps/rejected": -278.9137878417969, - "loss": 0.5699, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.004600034561008215, - "rewards/margins": 1.4109219312667847, - "rewards/rejected": -1.4155219793319702, + "epoch": 0.43, + "learning_rate": 4.7532989099254154e-07, + "logits/chosen": -2.862799882888794, + "logits/rejected": -2.8322913646698, + "logps/chosen": -224.6620635986328, + "logps/rejected": -297.45562744140625, + "loss": 0.4842, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.3813169598579407, + "rewards/margins": 2.5376696586608887, + "rewards/rejected": -2.1563527584075928, "step": 840 }, { - "epoch": 0.88, - "learning_rate": 3.9303482587064674e-07, - "logits/chosen": -3.0153908729553223, - "logits/rejected": -3.0218112468719482, - "logps/chosen": -369.01336669921875, - "logps/rejected": -302.21661376953125, - "loss": 0.5626, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.07364175468683243, - "rewards/margins": 1.458538293838501, - "rewards/rejected": -1.3848967552185059, + "epoch": 0.44, + "learning_rate": 4.7437368521705866e-07, + "logits/chosen": -2.974093437194824, + "logits/rejected": -2.984872579574585, + "logps/chosen": -197.2064208984375, + "logps/rejected": -292.5155944824219, + "loss": 0.5179, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.9365224838256836, + "rewards/margins": 3.0235087871551514, + "rewards/rejected": -2.0869860649108887, "step": 850 }, { - "epoch": 0.89, - "learning_rate": 3.911213164944508e-07, - "logits/chosen": -2.9755771160125732, - "logits/rejected": -2.9671168327331543, - "logps/chosen": -360.69061279296875, - "logps/rejected": -291.0027160644531, - "loss": 0.5702, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.19498476386070251, - "rewards/margins": 1.1925767660140991, - "rewards/rejected": -1.3875614404678345, + "epoch": 0.44, + "learning_rate": 4.7341747944157577e-07, + "logits/chosen": -2.951917886734009, + "logits/rejected": -2.975085735321045, + "logps/chosen": -277.63653564453125, + "logps/rejected": -305.8817138671875, + "loss": 0.4853, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.252098023891449, + "rewards/margins": 2.036212205886841, + "rewards/rejected": -1.7841142416000366, "step": 860 }, { - "epoch": 0.9, - "learning_rate": 3.8920780711825487e-07, - "logits/chosen": -3.0124592781066895, - "logits/rejected": -3.0058891773223877, - "logps/chosen": -362.83624267578125, - "logps/rejected": -305.2490539550781, - "loss": 0.5918, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.20124521851539612, - "rewards/margins": 1.2762418985366821, - "rewards/rejected": -1.0749967098236084, + "epoch": 0.45, + "learning_rate": 4.724612736660929e-07, + "logits/chosen": -2.8899247646331787, + "logits/rejected": -2.867297887802124, + "logps/chosen": -254.412353515625, + "logps/rejected": -274.4820861816406, + "loss": 0.4448, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6464666128158569, + "rewards/margins": 2.7713568210601807, + "rewards/rejected": -2.124890089035034, "step": 870 }, { - "epoch": 0.91, - "learning_rate": 3.8729429774205893e-07, - "logits/chosen": -2.963660478591919, - "logits/rejected": -3.0102293491363525, - "logps/chosen": -366.26568603515625, - "logps/rejected": -289.0752258300781, - "loss": 0.5749, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.3400164246559143, - "rewards/margins": 1.4809852838516235, - "rewards/rejected": -1.140968918800354, + "epoch": 0.45, + "learning_rate": 4.7150506789061006e-07, + "logits/chosen": -2.966210126876831, + "logits/rejected": -2.979029655456543, + "logps/chosen": -266.33978271484375, + "logps/rejected": -354.4034729003906, + "loss": 0.5281, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.17260950803756714, + "rewards/margins": 2.235333204269409, + "rewards/rejected": -2.0627236366271973, "step": 880 }, { - "epoch": 0.92, - "learning_rate": 3.8538078836586294e-07, - "logits/chosen": -2.9732441902160645, - "logits/rejected": -2.97709321975708, - "logps/chosen": -321.77056884765625, - "logps/rejected": -294.64068603515625, - "loss": 0.5648, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.07467867434024811, - "rewards/margins": 1.4241504669189453, - "rewards/rejected": -1.3494718074798584, + "epoch": 0.46, + "learning_rate": 4.7054886211512717e-07, + "logits/chosen": -3.0034615993499756, + "logits/rejected": -3.0113863945007324, + "logps/chosen": -236.93765258789062, + "logps/rejected": -301.739013671875, + "loss": 0.4411, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5323739051818848, + "rewards/margins": 2.8146984577178955, + "rewards/rejected": -2.2823245525360107, "step": 890 }, { - "epoch": 0.93, - "learning_rate": 3.83467278989667e-07, - "logits/chosen": -2.9654107093811035, - "logits/rejected": -2.946199417114258, - "logps/chosen": -333.27496337890625, - "logps/rejected": -261.41510009765625, - "loss": 0.5293, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.17271609604358673, - "rewards/margins": 1.3218122720718384, - "rewards/rejected": -1.149096131324768, + "epoch": 0.46, + "learning_rate": 4.695926563396443e-07, + "logits/chosen": -3.032571315765381, + "logits/rejected": -3.023155927658081, + "logps/chosen": -244.9833221435547, + "logps/rejected": -273.37054443359375, + "loss": 0.5377, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09555510431528091, + "rewards/margins": 2.1211750507354736, + "rewards/rejected": -2.0256197452545166, "step": 900 }, { - "epoch": 0.93, - "eval_logits/chosen": -2.953744649887085, - "eval_logits/rejected": -2.946305990219116, - "eval_logps/chosen": -351.4819030761719, - "eval_logps/rejected": -297.2767639160156, - "eval_loss": 0.5146499872207642, - "eval_rewards/accuracies": 0.7620000243186951, - "eval_rewards/chosen": 0.13839714229106903, - "eval_rewards/margins": 1.4541445970535278, - "eval_rewards/rejected": -1.3157474994659424, - "eval_runtime": 500.2192, - "eval_samples_per_second": 3.998, - "eval_steps_per_second": 0.5, + "epoch": 0.46, + "eval_logits/chosen": -2.904477834701538, + "eval_logits/rejected": -2.9651331901550293, + "eval_logps/chosen": -242.6551513671875, + "eval_logps/rejected": -295.7052001953125, + "eval_loss": 0.4912913739681244, + "eval_rewards/accuracies": 0.7960000038146973, + "eval_rewards/chosen": 0.5024977922439575, + "eval_rewards/margins": 2.4592745304107666, + "eval_rewards/rejected": -1.9567766189575195, + "eval_runtime": 278.6054, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.449, "step": 900 }, { - "epoch": 0.94, - "learning_rate": 3.8155376961347106e-07, - "logits/chosen": -2.9613089561462402, - "logits/rejected": -2.9708240032196045, - "logps/chosen": -272.4183044433594, - "logps/rejected": -238.2782440185547, - "loss": 0.5416, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.04612383991479874, - "rewards/margins": 1.3310301303863525, - "rewards/rejected": -1.2849063873291016, + "epoch": 0.47, + "learning_rate": 4.686364505641614e-07, + "logits/chosen": -2.992729663848877, + "logits/rejected": -2.9883501529693604, + "logps/chosen": -262.54608154296875, + "logps/rejected": -270.30059814453125, + "loss": 0.4715, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4884259104728699, + "rewards/margins": 2.7323694229125977, + "rewards/rejected": -2.243943691253662, "step": 910 }, { - "epoch": 0.95, - "learning_rate": 3.796402602372751e-07, - "logits/chosen": -2.9403462409973145, - "logits/rejected": -2.9374024868011475, - "logps/chosen": -331.61199951171875, - "logps/rejected": -297.79296875, - "loss": 0.5002, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.15933088958263397, - "rewards/margins": 1.321645975112915, - "rewards/rejected": -1.162315011024475, + "epoch": 0.47, + "learning_rate": 4.676802447886785e-07, + "logits/chosen": -2.9018282890319824, + "logits/rejected": -2.9027373790740967, + "logps/chosen": -248.11807250976562, + "logps/rejected": -285.3119201660156, + "loss": 0.5509, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.48879727721214294, + "rewards/margins": 2.8756463527679443, + "rewards/rejected": -2.3868489265441895, "step": 920 }, { - "epoch": 0.96, - "learning_rate": 3.777267508610792e-07, - "logits/chosen": -2.8792290687561035, - "logits/rejected": -2.904510498046875, - "logps/chosen": -337.1361389160156, - "logps/rejected": -294.4916687011719, - "loss": 0.5735, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.10670886188745499, - "rewards/margins": 1.252929449081421, - "rewards/rejected": -1.3596383333206177, + "epoch": 0.48, + "learning_rate": 4.6672403901319564e-07, + "logits/chosen": -2.9280457496643066, + "logits/rejected": -2.954550266265869, + "logps/chosen": -214.4419403076172, + "logps/rejected": -278.14483642578125, + "loss": 0.3943, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.816567063331604, + "rewards/margins": 2.9894230365753174, + "rewards/rejected": -2.1728556156158447, "step": 930 }, { - "epoch": 0.97, - "learning_rate": 3.7581324148488325e-07, - "logits/chosen": -2.9526283740997314, - "logits/rejected": -2.936981201171875, - "logps/chosen": -384.3909606933594, - "logps/rejected": -281.7669677734375, - "loss": 0.5667, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.11869187653064728, - "rewards/margins": 1.0847665071487427, - "rewards/rejected": -1.203458309173584, + "epoch": 0.49, + "learning_rate": 4.6576783323771275e-07, + "logits/chosen": -2.8166980743408203, + "logits/rejected": -2.7993063926696777, + "logps/chosen": -204.55262756347656, + "logps/rejected": -259.32012939453125, + "loss": 0.5375, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.009540450759232044, + "rewards/margins": 2.455078601837158, + "rewards/rejected": -2.445538282394409, "step": 940 }, { - "epoch": 0.98, - "learning_rate": 3.738997321086873e-07, - "logits/chosen": -2.9670538902282715, - "logits/rejected": -2.9704997539520264, - "logps/chosen": -334.4848937988281, - "logps/rejected": -277.0538330078125, - "loss": 0.5199, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.020179124549031258, - "rewards/margins": 1.3067853450775146, - "rewards/rejected": -1.2866061925888062, + "epoch": 0.49, + "learning_rate": 4.6481162746222987e-07, + "logits/chosen": -2.8641529083251953, + "logits/rejected": -2.8614368438720703, + "logps/chosen": -309.6964111328125, + "logps/rejected": -332.9620666503906, + "loss": 0.4252, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.04014641046524048, + "rewards/margins": 3.2200770378112793, + "rewards/rejected": -3.260223388671875, "step": 950 }, { - "epoch": 0.99, - "learning_rate": 3.7198622273249137e-07, - "logits/chosen": -2.955815076828003, - "logits/rejected": -2.9407596588134766, - "logps/chosen": -342.2349548339844, - "logps/rejected": -290.681396484375, - "loss": 0.5393, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.006191739346832037, - "rewards/margins": 1.012204885482788, - "rewards/rejected": -1.0060131549835205, + "epoch": 0.5, + "learning_rate": 4.63855421686747e-07, + "logits/chosen": -2.892148494720459, + "logits/rejected": -2.906142473220825, + "logps/chosen": -258.5798645019531, + "logps/rejected": -293.4261474609375, + "loss": 0.4546, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.11958281695842743, + "rewards/margins": 2.9684510231018066, + "rewards/rejected": -2.8488681316375732, "step": 960 }, { - "epoch": 1.0, - "learning_rate": 3.7007271335629544e-07, - "logits/chosen": -2.9522531032562256, - "logits/rejected": -2.938239097595215, - "logps/chosen": -312.70745849609375, - "logps/rejected": -283.48223876953125, - "loss": 0.4885, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.21793344616889954, - "rewards/margins": 1.9288184642791748, - "rewards/rejected": -1.7108850479125977, + "epoch": 0.5, + "learning_rate": 4.628992159112641e-07, + "logits/chosen": -2.8312416076660156, + "logits/rejected": -2.909719228744507, + "logps/chosen": -231.47561645507812, + "logps/rejected": -321.9608459472656, + "loss": 0.4045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.29932117462158203, + "rewards/margins": 2.905813217163086, + "rewards/rejected": -2.606492042541504, "step": 970 }, { - "epoch": 1.01, - "learning_rate": 3.681592039800995e-07, - "logits/chosen": -2.943690776824951, - "logits/rejected": -2.9434661865234375, - "logps/chosen": -276.56475830078125, - "logps/rejected": -279.9871826171875, - "loss": 0.1281, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.4070587158203125, - "rewards/margins": 4.2252960205078125, - "rewards/rejected": -2.818237543106079, + "epoch": 0.51, + "learning_rate": 4.6194301013578116e-07, + "logits/chosen": -2.8688275814056396, + "logits/rejected": -2.9191553592681885, + "logps/chosen": -308.28192138671875, + "logps/rejected": -300.1478576660156, + "loss": 0.4955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015064060688018799, + "rewards/margins": 2.7704272270202637, + "rewards/rejected": -2.7854912281036377, "step": 980 }, { - "epoch": 1.02, - "learning_rate": 3.662456946039035e-07, - "logits/chosen": -2.945516586303711, - "logits/rejected": -2.937763214111328, - "logps/chosen": -294.52215576171875, - "logps/rejected": -295.2162170410156, - "loss": 0.1193, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 1.4316601753234863, - "rewards/margins": 4.2783918380737305, - "rewards/rejected": -2.846731185913086, + "epoch": 0.51, + "learning_rate": 4.609868043602983e-07, + "logits/chosen": -2.9763271808624268, + "logits/rejected": -2.945359706878662, + "logps/chosen": -205.1085968017578, + "logps/rejected": -291.16156005859375, + "loss": 0.3997, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.6670485734939575, + "rewards/margins": 3.19486927986145, + "rewards/rejected": -2.5278208255767822, "step": 990 }, { - "epoch": 1.03, - "learning_rate": 3.6433218522770757e-07, - "logits/chosen": -2.846494436264038, - "logits/rejected": -2.858985424041748, - "logps/chosen": -310.45452880859375, - "logps/rejected": -307.7423400878906, - "loss": 0.1078, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.7381668090820312, - "rewards/margins": 4.478921890258789, - "rewards/rejected": -2.740755081176758, + "epoch": 0.52, + "learning_rate": 4.600305985848154e-07, + "logits/chosen": -2.8994736671447754, + "logits/rejected": -2.9172794818878174, + "logps/chosen": -231.45339965820312, + "logps/rejected": -294.75274658203125, + "loss": 0.4886, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15061122179031372, + "rewards/margins": 2.3647937774658203, + "rewards/rejected": -2.2141823768615723, "step": 1000 }, { - "epoch": 1.03, - "eval_logits/chosen": -2.9057369232177734, - "eval_logits/rejected": -2.8941798210144043, - "eval_logps/chosen": -349.8238220214844, - "eval_logps/rejected": -300.6510314941406, - "eval_loss": 0.5121060609817505, - "eval_rewards/accuracies": 0.777999997138977, - "eval_rewards/chosen": 0.30420342087745667, - "eval_rewards/margins": 1.9573808908462524, - "eval_rewards/rejected": -1.6531774997711182, - "eval_runtime": 499.8834, - "eval_samples_per_second": 4.001, - "eval_steps_per_second": 0.5, + "epoch": 0.52, + "eval_logits/chosen": -2.8935189247131348, + "eval_logits/rejected": -2.9735095500946045, + "eval_logps/chosen": -246.81280517578125, + "eval_logps/rejected": -304.04638671875, + "eval_loss": 0.44952473044395447, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": 0.0867358073592186, + "eval_rewards/margins": 2.877634286880493, + "eval_rewards/rejected": -2.790898084640503, + "eval_runtime": 278.697, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.449, "step": 1000 }, { - "epoch": 1.04, - "learning_rate": 3.6241867585151163e-07, - "logits/chosen": -2.9094786643981934, - "logits/rejected": -2.919285535812378, - "logps/chosen": -305.79949951171875, - "logps/rejected": -297.85455322265625, - "loss": 0.108, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.7003862857818604, - "rewards/margins": 4.974067687988281, - "rewards/rejected": -3.273681640625, + "epoch": 0.52, + "learning_rate": 4.590743928093325e-07, + "logits/chosen": -2.7804977893829346, + "logits/rejected": -2.7935779094696045, + "logps/chosen": -289.16845703125, + "logps/rejected": -317.79296875, + "loss": 0.4715, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.14265815913677216, + "rewards/margins": 2.7420856952667236, + "rewards/rejected": -2.5994274616241455, "step": 1010 }, { - "epoch": 1.05, - "learning_rate": 3.605051664753157e-07, - "logits/chosen": -2.880923271179199, - "logits/rejected": -2.887871742248535, - "logps/chosen": -312.16107177734375, - "logps/rejected": -289.467041015625, - "loss": 0.1136, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.358328104019165, - "rewards/margins": 4.708084583282471, - "rewards/rejected": -3.3497557640075684, + "epoch": 0.53, + "learning_rate": 4.581181870338497e-07, + "logits/chosen": -2.856433391571045, + "logits/rejected": -2.861987590789795, + "logps/chosen": -326.0638732910156, + "logps/rejected": -347.38494873046875, + "loss": 0.375, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2731901705265045, + "rewards/margins": 3.395005702972412, + "rewards/rejected": -3.1218154430389404, "step": 1020 }, { - "epoch": 1.06, - "learning_rate": 3.5859165709911975e-07, - "logits/chosen": -2.883192539215088, - "logits/rejected": -2.8778469562530518, - "logps/chosen": -327.26788330078125, - "logps/rejected": -333.3467712402344, - "loss": 0.0861, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5257574319839478, - "rewards/margins": 4.850948333740234, - "rewards/rejected": -3.325191020965576, + "epoch": 0.53, + "learning_rate": 4.571619812583668e-07, + "logits/chosen": -2.915353536605835, + "logits/rejected": -2.9367640018463135, + "logps/chosen": -259.37750244140625, + "logps/rejected": -325.2166442871094, + "loss": 0.4497, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03804367035627365, + "rewards/margins": 2.7071585655212402, + "rewards/rejected": -2.6691150665283203, "step": 1030 }, { - "epoch": 1.07, - "learning_rate": 3.566781477229238e-07, - "logits/chosen": -2.946524143218994, - "logits/rejected": -2.9164326190948486, - "logps/chosen": -303.1543884277344, - "logps/rejected": -342.10772705078125, - "loss": 0.1002, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.4854071140289307, - "rewards/margins": 5.697620391845703, - "rewards/rejected": -4.212213516235352, + "epoch": 0.54, + "learning_rate": 4.562057754828839e-07, + "logits/chosen": -2.8658149242401123, + "logits/rejected": -2.855698823928833, + "logps/chosen": -284.882568359375, + "logps/rejected": -290.2754821777344, + "loss": 0.5011, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2647832930088043, + "rewards/margins": 3.2260468006134033, + "rewards/rejected": -2.9612631797790527, "step": 1040 }, { - "epoch": 1.08, - "learning_rate": 3.547646383467279e-07, - "logits/chosen": -2.891878604888916, - "logits/rejected": -2.8879518508911133, - "logps/chosen": -317.4754943847656, - "logps/rejected": -299.55224609375, - "loss": 0.1082, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 1.3902817964553833, - "rewards/margins": 5.116191387176514, - "rewards/rejected": -3.725910186767578, + "epoch": 0.54, + "learning_rate": 4.55249569707401e-07, + "logits/chosen": -2.91452956199646, + "logits/rejected": -2.936554431915283, + "logps/chosen": -212.68734741210938, + "logps/rejected": -293.65185546875, + "loss": 0.4751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3139684796333313, + "rewards/margins": 2.481248617172241, + "rewards/rejected": -2.7952170372009277, "step": 1050 }, { - "epoch": 1.09, - "learning_rate": 3.5285112897053194e-07, - "logits/chosen": -2.9003710746765137, - "logits/rejected": -2.902200222015381, - "logps/chosen": -315.47528076171875, - "logps/rejected": -322.63446044921875, - "loss": 0.0864, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4996274709701538, - "rewards/margins": 5.4629316329956055, - "rewards/rejected": -3.963304042816162, + "epoch": 0.55, + "learning_rate": 4.5429336393191814e-07, + "logits/chosen": -2.83843994140625, + "logits/rejected": -2.899430751800537, + "logps/chosen": -221.84915161132812, + "logps/rejected": -261.29046630859375, + "loss": 0.6509, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.24030618369579315, + "rewards/margins": 2.0365347862243652, + "rewards/rejected": -1.7962287664413452, "step": 1060 }, { - "epoch": 1.11, - "learning_rate": 3.50937619594336e-07, - "logits/chosen": -2.9368162155151367, - "logits/rejected": -2.9310317039489746, - "logps/chosen": -323.0782165527344, - "logps/rejected": -339.6298828125, - "loss": 0.113, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.027733564376831, - "rewards/margins": 4.9948883056640625, - "rewards/rejected": -3.9671554565429688, + "epoch": 0.55, + "learning_rate": 4.5333715815643525e-07, + "logits/chosen": -2.9258434772491455, + "logits/rejected": -2.9901933670043945, + "logps/chosen": -266.33856201171875, + "logps/rejected": -313.81488037109375, + "loss": 0.5163, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.2790577709674835, + "rewards/margins": 2.187150478363037, + "rewards/rejected": -2.4662084579467773, "step": 1070 }, { - "epoch": 1.12, - "learning_rate": 3.4902411021814007e-07, - "logits/chosen": -2.9368138313293457, - "logits/rejected": -2.9042770862579346, - "logps/chosen": -353.8468017578125, - "logps/rejected": -307.30560302734375, - "loss": 0.1081, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.38088858127594, - "rewards/margins": 5.396637439727783, - "rewards/rejected": -4.015748500823975, + "epoch": 0.56, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": -2.9200329780578613, + "logits/rejected": -2.912764310836792, + "logps/chosen": -227.3638916015625, + "logps/rejected": -279.1597595214844, + "loss": 0.4705, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.6271113157272339, + "rewards/margins": 2.5784404277801514, + "rewards/rejected": -1.951329231262207, "step": 1080 }, { - "epoch": 1.13, - "learning_rate": 3.4711060084194413e-07, - "logits/chosen": -2.900344133377075, - "logits/rejected": -2.9066405296325684, - "logps/chosen": -332.6580505371094, - "logps/rejected": -306.313232421875, - "loss": 0.1185, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.6640781164169312, - "rewards/margins": 4.935359477996826, - "rewards/rejected": -3.2712814807891846, + "epoch": 0.56, + "learning_rate": 4.514247466054695e-07, + "logits/chosen": -2.837622880935669, + "logits/rejected": -2.9189155101776123, + "logps/chosen": -200.35733032226562, + "logps/rejected": -259.71942138671875, + "loss": 0.4103, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.44330382347106934, + "rewards/margins": 2.879500150680542, + "rewards/rejected": -2.4361963272094727, "step": 1090 }, { - "epoch": 1.14, - "learning_rate": 3.4519709146574814e-07, - "logits/chosen": -2.927727460861206, - "logits/rejected": -2.8953537940979004, - "logps/chosen": -267.3210144042969, - "logps/rejected": -272.5216979980469, - "loss": 0.0928, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3533536195755005, - "rewards/margins": 5.032728672027588, - "rewards/rejected": -3.6793746948242188, + "epoch": 0.57, + "learning_rate": 4.504685408299866e-07, + "logits/chosen": -2.7896180152893066, + "logits/rejected": -2.818835496902466, + "logps/chosen": -284.47930908203125, + "logps/rejected": -365.02630615234375, + "loss": 0.4447, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.15361805260181427, + "rewards/margins": 2.5342843532562256, + "rewards/rejected": -2.3806662559509277, "step": 1100 }, { - "epoch": 1.14, - "eval_logits/chosen": -2.90618896484375, - "eval_logits/rejected": -2.891117811203003, - "eval_logps/chosen": -352.6541442871094, - "eval_logps/rejected": -305.599853515625, - "eval_loss": 0.5322153568267822, - "eval_rewards/accuracies": 0.777999997138977, - "eval_rewards/chosen": 0.021169064566493034, - "eval_rewards/margins": 2.169229030609131, - "eval_rewards/rejected": -2.148059844970703, - "eval_runtime": 500.2131, - "eval_samples_per_second": 3.998, - "eval_steps_per_second": 0.5, + "epoch": 0.57, + "eval_logits/chosen": -2.7942800521850586, + "eval_logits/rejected": -2.870736837387085, + "eval_logps/chosen": -244.3843994140625, + "eval_logps/rejected": -300.1572570800781, + "eval_loss": 0.4398292303085327, + "eval_rewards/accuracies": 0.8100000023841858, + "eval_rewards/chosen": 0.3295760750770569, + "eval_rewards/margins": 2.731562614440918, + "eval_rewards/rejected": -2.4019863605499268, + "eval_runtime": 278.8945, + "eval_samples_per_second": 7.171, + "eval_steps_per_second": 0.448, "step": 1100 }, { - "epoch": 1.15, - "learning_rate": 3.432835820895522e-07, - "logits/chosen": -2.8970720767974854, - "logits/rejected": -2.8721566200256348, - "logps/chosen": -315.86383056640625, - "logps/rejected": -312.5824890136719, - "loss": 0.0952, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2646430730819702, - "rewards/margins": 5.519986152648926, - "rewards/rejected": -4.255342483520508, + "epoch": 0.57, + "learning_rate": 4.495123350545037e-07, + "logits/chosen": -2.8621342182159424, + "logits/rejected": -2.8880348205566406, + "logps/chosen": -304.4235534667969, + "logps/rejected": -333.8514709472656, + "loss": 0.4558, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11846466362476349, + "rewards/margins": 2.162449359893799, + "rewards/rejected": -2.280913829803467, "step": 1110 }, { - "epoch": 1.16, - "learning_rate": 3.4137007271335626e-07, - "logits/chosen": -2.8670730590820312, - "logits/rejected": -2.8753676414489746, - "logps/chosen": -281.70355224609375, - "logps/rejected": -292.56365966796875, - "loss": 0.1105, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9823983907699585, - "rewards/margins": 4.9709882736206055, - "rewards/rejected": -3.9885895252227783, + "epoch": 0.58, + "learning_rate": 4.4855612927902083e-07, + "logits/chosen": -2.8462681770324707, + "logits/rejected": -2.8501837253570557, + "logps/chosen": -270.5716552734375, + "logps/rejected": -292.13238525390625, + "loss": 0.5499, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.028979312628507614, + "rewards/margins": 2.364931583404541, + "rewards/rejected": -2.3359522819519043, "step": 1120 }, { - "epoch": 1.17, - "learning_rate": 3.394565633371603e-07, - "logits/chosen": -2.9044623374938965, - "logits/rejected": -2.8968944549560547, - "logps/chosen": -370.77630615234375, - "logps/rejected": -357.85198974609375, - "loss": 0.1185, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 2.036125659942627, - "rewards/margins": 6.214476108551025, - "rewards/rejected": -4.17834997177124, + "epoch": 0.58, + "learning_rate": 4.4759992350353795e-07, + "logits/chosen": -2.821378231048584, + "logits/rejected": -2.7994885444641113, + "logps/chosen": -229.1116943359375, + "logps/rejected": -278.6077880859375, + "loss": 0.4073, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4151396155357361, + "rewards/margins": 3.402142286300659, + "rewards/rejected": -2.9870028495788574, "step": 1130 }, { - "epoch": 1.18, - "learning_rate": 3.375430539609644e-07, - "logits/chosen": -2.8854241371154785, - "logits/rejected": -2.897651433944702, - "logps/chosen": -290.554443359375, - "logps/rejected": -339.0045166015625, - "loss": 0.0878, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8847514390945435, - "rewards/margins": 5.345077991485596, - "rewards/rejected": -4.460326671600342, + "epoch": 0.59, + "learning_rate": 4.46643717728055e-07, + "logits/chosen": -2.7817282676696777, + "logits/rejected": -2.7494282722473145, + "logps/chosen": -238.2794647216797, + "logps/rejected": -298.65216064453125, + "loss": 0.4258, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0842290148139, + "rewards/margins": 2.6528801918029785, + "rewards/rejected": -2.568650960922241, "step": 1140 }, { - "epoch": 1.19, - "learning_rate": 3.3562954458476845e-07, - "logits/chosen": -2.916201114654541, - "logits/rejected": -2.897951364517212, - "logps/chosen": -355.0877380371094, - "logps/rejected": -328.5706481933594, - "loss": 0.0898, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.6507154703140259, - "rewards/margins": 6.26724910736084, - "rewards/rejected": -4.616534233093262, + "epoch": 0.59, + "learning_rate": 4.4568751195257213e-07, + "logits/chosen": -2.7463810443878174, + "logits/rejected": -2.749958038330078, + "logps/chosen": -273.40826416015625, + "logps/rejected": -321.42889404296875, + "loss": 0.4437, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.8225911855697632, + "rewards/margins": 3.9052181243896484, + "rewards/rejected": -3.0826268196105957, "step": 1150 }, { - "epoch": 1.2, - "learning_rate": 3.337160352085725e-07, - "logits/chosen": -2.8670494556427, - "logits/rejected": -2.904618740081787, - "logps/chosen": -328.02496337890625, - "logps/rejected": -328.0072326660156, - "loss": 0.0763, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.1252260208129883, - "rewards/margins": 5.925239562988281, - "rewards/rejected": -4.800013542175293, + "epoch": 0.6, + "learning_rate": 4.447313061770893e-07, + "logits/chosen": -2.7460649013519287, + "logits/rejected": -2.712463617324829, + "logps/chosen": -253.2078094482422, + "logps/rejected": -373.8803405761719, + "loss": 0.4737, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.4817771315574646, + "rewards/margins": 3.4784576892852783, + "rewards/rejected": -2.996680736541748, "step": 1160 }, { - "epoch": 1.21, - "learning_rate": 3.3180252583237657e-07, - "logits/chosen": -2.9289798736572266, - "logits/rejected": -2.929511547088623, - "logps/chosen": -325.98822021484375, - "logps/rejected": -297.0420837402344, - "loss": 0.0937, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.4292241334915161, - "rewards/margins": 6.024569988250732, - "rewards/rejected": -4.595345973968506, + "epoch": 0.6, + "learning_rate": 4.437751004016064e-07, + "logits/chosen": -2.683619976043701, + "logits/rejected": -2.715719699859619, + "logps/chosen": -225.0996856689453, + "logps/rejected": -282.35101318359375, + "loss": 0.5286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4098489284515381, + "rewards/margins": 2.038952589035034, + "rewards/rejected": -2.448801279067993, "step": 1170 }, { - "epoch": 1.22, - "learning_rate": 3.2988901645618063e-07, - "logits/chosen": -2.879714012145996, - "logits/rejected": -2.850334644317627, - "logps/chosen": -335.61297607421875, - "logps/rejected": -316.1272888183594, - "loss": 0.0961, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.421682357788086, - "rewards/margins": 5.595412254333496, - "rewards/rejected": -4.173730373382568, + "epoch": 0.61, + "learning_rate": 4.4281889462612353e-07, + "logits/chosen": -2.783116340637207, + "logits/rejected": -2.8799386024475098, + "logps/chosen": -219.9791717529297, + "logps/rejected": -270.49774169921875, + "loss": 0.4362, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3964368999004364, + "rewards/margins": 3.005711078643799, + "rewards/rejected": -2.60927414894104, "step": 1180 }, { - "epoch": 1.23, - "learning_rate": 3.279755070799847e-07, - "logits/chosen": -2.9137988090515137, - "logits/rejected": -2.921215057373047, - "logps/chosen": -299.4486999511719, - "logps/rejected": -330.41265869140625, - "loss": 0.0991, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.5697805881500244, - "rewards/margins": 5.397409439086914, - "rewards/rejected": -3.8276290893554688, + "epoch": 0.61, + "learning_rate": 4.4186268885064064e-07, + "logits/chosen": -2.8068182468414307, + "logits/rejected": -2.8661465644836426, + "logps/chosen": -277.33917236328125, + "logps/rejected": -300.82470703125, + "loss": 0.5316, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07599838078022003, + "rewards/margins": 2.2254130840301514, + "rewards/rejected": -2.149414539337158, "step": 1190 }, { - "epoch": 1.24, - "learning_rate": 3.260619977037887e-07, - "logits/chosen": -2.8730692863464355, - "logits/rejected": -2.890625476837158, - "logps/chosen": -322.72906494140625, - "logps/rejected": -322.37774658203125, - "loss": 0.1295, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4574929475784302, - "rewards/margins": 5.438672065734863, - "rewards/rejected": -3.9811782836914062, + "epoch": 0.62, + "learning_rate": 4.4090648307515776e-07, + "logits/chosen": -2.8061962127685547, + "logits/rejected": -2.864501476287842, + "logps/chosen": -193.76907348632812, + "logps/rejected": -220.1031036376953, + "loss": 0.4971, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.01744268834590912, + "rewards/margins": 2.256462335586548, + "rewards/rejected": -2.273904800415039, "step": 1200 }, { - "epoch": 1.24, - "eval_logits/chosen": -2.9265103340148926, - "eval_logits/rejected": -2.919196844100952, - "eval_logps/chosen": -354.89837646484375, - "eval_logps/rejected": -307.5298156738281, - "eval_loss": 0.5383667945861816, - "eval_rewards/accuracies": 0.7760000228881836, - "eval_rewards/chosen": -0.20325522124767303, - "eval_rewards/margins": 2.137800693511963, - "eval_rewards/rejected": -2.3410558700561523, - "eval_runtime": 500.2164, - "eval_samples_per_second": 3.998, - "eval_steps_per_second": 0.5, + "epoch": 0.62, + "eval_logits/chosen": -2.782477855682373, + "eval_logits/rejected": -2.8602378368377686, + "eval_logps/chosen": -242.60581970214844, + "eval_logps/rejected": -298.29931640625, + "eval_loss": 0.44120046496391296, + "eval_rewards/accuracies": 0.7940000295639038, + "eval_rewards/chosen": 0.507435142993927, + "eval_rewards/margins": 2.723625898361206, + "eval_rewards/rejected": -2.2161905765533447, + "eval_runtime": 278.8134, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 0.448, "step": 1200 }, { - "epoch": 1.25, - "learning_rate": 3.2414848832759277e-07, - "logits/chosen": -2.9244723320007324, - "logits/rejected": -2.8997814655303955, - "logps/chosen": -300.6869201660156, - "logps/rejected": -309.40478515625, - "loss": 0.0912, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.3476300239562988, - "rewards/margins": 5.487320899963379, - "rewards/rejected": -4.139689922332764, + "epoch": 0.62, + "learning_rate": 4.399502772996749e-07, + "logits/chosen": -2.8208165168762207, + "logits/rejected": -2.87434720993042, + "logps/chosen": -228.93905639648438, + "logps/rejected": -299.3986511230469, + "loss": 0.4816, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.21798264980316162, + "rewards/margins": 2.769120693206787, + "rewards/rejected": -2.5511374473571777, "step": 1210 }, { - "epoch": 1.26, - "learning_rate": 3.2223497895139683e-07, - "logits/chosen": -2.9218931198120117, - "logits/rejected": -2.923119068145752, - "logps/chosen": -338.5274353027344, - "logps/rejected": -345.64019775390625, - "loss": 0.1366, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.7827097177505493, - "rewards/margins": 5.954866409301758, - "rewards/rejected": -4.172156810760498, + "epoch": 0.63, + "learning_rate": 4.38994071524192e-07, + "logits/chosen": -2.924377918243408, + "logits/rejected": -2.951292037963867, + "logps/chosen": -258.6681823730469, + "logps/rejected": -258.79351806640625, + "loss": 0.4607, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.15325573086738586, + "rewards/margins": 2.3194260597229004, + "rewards/rejected": -2.1661698818206787, "step": 1220 }, { - "epoch": 1.27, - "learning_rate": 3.203214695752009e-07, - "logits/chosen": -2.9061856269836426, - "logits/rejected": -2.9157071113586426, - "logps/chosen": -322.0141296386719, - "logps/rejected": -351.03692626953125, - "loss": 0.1512, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.399315595626831, - "rewards/margins": 5.718806266784668, - "rewards/rejected": -4.319490909576416, + "epoch": 0.64, + "learning_rate": 4.380378657487091e-07, + "logits/chosen": -2.915999174118042, + "logits/rejected": -2.9771485328674316, + "logps/chosen": -262.7147521972656, + "logps/rejected": -365.8441162109375, + "loss": 0.5435, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.577280580997467, + "rewards/margins": 3.5463051795959473, + "rewards/rejected": -2.969024419784546, "step": 1230 }, { - "epoch": 1.28, - "learning_rate": 3.1840796019900495e-07, - "logits/chosen": -2.8873629570007324, - "logits/rejected": -2.9147911071777344, - "logps/chosen": -346.08380126953125, - "logps/rejected": -356.04461669921875, - "loss": 0.1031, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.514539122581482, - "rewards/margins": 5.836313724517822, - "rewards/rejected": -4.321774482727051, + "epoch": 0.64, + "learning_rate": 4.370816599732262e-07, + "logits/chosen": -2.927295207977295, + "logits/rejected": -3.0081796646118164, + "logps/chosen": -247.39413452148438, + "logps/rejected": -281.930908203125, + "loss": 0.4259, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.4076114594936371, + "rewards/margins": 3.5351879596710205, + "rewards/rejected": -3.1275763511657715, "step": 1240 }, { - "epoch": 1.29, - "learning_rate": 3.16494450822809e-07, - "logits/chosen": -2.8592748641967773, - "logits/rejected": -2.854814052581787, - "logps/chosen": -294.95758056640625, - "logps/rejected": -280.81866455078125, - "loss": 0.0988, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9549561738967896, - "rewards/margins": 4.784027099609375, - "rewards/rejected": -3.829070568084717, + "epoch": 0.65, + "learning_rate": 4.3612545419774334e-07, + "logits/chosen": -2.8849966526031494, + "logits/rejected": -2.914768695831299, + "logps/chosen": -219.829345703125, + "logps/rejected": -322.58203125, + "loss": 0.5023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16795411705970764, + "rewards/margins": 2.0571203231811523, + "rewards/rejected": -2.225074291229248, "step": 1250 }, { - "epoch": 1.3, - "learning_rate": 3.145809414466131e-07, - "logits/chosen": -2.8863461017608643, - "logits/rejected": -2.870108127593994, - "logps/chosen": -314.99798583984375, - "logps/rejected": -317.19476318359375, - "loss": 0.0949, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.104218602180481, - "rewards/margins": 5.444802284240723, - "rewards/rejected": -4.340583801269531, + "epoch": 0.65, + "learning_rate": 4.3516924842226045e-07, + "logits/chosen": -2.861953020095825, + "logits/rejected": -2.931356430053711, + "logps/chosen": -271.10101318359375, + "logps/rejected": -307.50628662109375, + "loss": 0.4281, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.07305797189474106, + "rewards/margins": 2.9939346313476562, + "rewards/rejected": -2.9208762645721436, "step": 1260 }, { - "epoch": 1.31, - "learning_rate": 3.1266743207041714e-07, - "logits/chosen": -2.835925817489624, - "logits/rejected": -2.8321166038513184, - "logps/chosen": -360.5287170410156, - "logps/rejected": -305.8837890625, - "loss": 0.1046, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5260238647460938, - "rewards/margins": 5.6372551918029785, - "rewards/rejected": -4.111231327056885, + "epoch": 0.66, + "learning_rate": 4.3421304264677757e-07, + "logits/chosen": -2.904646396636963, + "logits/rejected": -2.9532904624938965, + "logps/chosen": -249.11837768554688, + "logps/rejected": -269.0419921875, + "loss": 0.393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16748471558094025, + "rewards/margins": 2.746199131011963, + "rewards/rejected": -2.578714370727539, "step": 1270 }, { - "epoch": 1.32, - "learning_rate": 3.107539226942212e-07, - "logits/chosen": -2.8840396404266357, - "logits/rejected": -2.902611017227173, - "logps/chosen": -306.69293212890625, - "logps/rejected": -348.4713439941406, - "loss": 0.1141, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5585787296295166, - "rewards/margins": 6.097206115722656, - "rewards/rejected": -4.5386271476745605, + "epoch": 0.66, + "learning_rate": 4.332568368712947e-07, + "logits/chosen": -2.971287250518799, + "logits/rejected": -3.0082404613494873, + "logps/chosen": -250.14260864257812, + "logps/rejected": -292.34637451171875, + "loss": 0.4698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24356532096862793, + "rewards/margins": 2.4449563026428223, + "rewards/rejected": -2.6885218620300293, "step": 1280 }, { - "epoch": 1.33, - "learning_rate": 3.0884041331802526e-07, - "logits/chosen": -2.8800981044769287, - "logits/rejected": -2.886286973953247, - "logps/chosen": -299.43328857421875, - "logps/rejected": -306.6632385253906, - "loss": 0.1164, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0270495414733887, - "rewards/margins": 5.2709574699401855, - "rewards/rejected": -4.243908405303955, + "epoch": 0.67, + "learning_rate": 4.323006310958118e-07, + "logits/chosen": -2.863560676574707, + "logits/rejected": -2.9776759147644043, + "logps/chosen": -264.33233642578125, + "logps/rejected": -316.8869323730469, + "loss": 0.4937, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3103228807449341, + "rewards/margins": 3.370903491973877, + "rewards/rejected": -3.0605804920196533, "step": 1290 }, { - "epoch": 1.34, - "learning_rate": 3.0692690394182927e-07, - "logits/chosen": -2.9457216262817383, - "logits/rejected": -2.9319682121276855, - "logps/chosen": -326.23724365234375, - "logps/rejected": -328.8221130371094, - "loss": 0.1093, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.3889268636703491, - "rewards/margins": 5.26351261138916, - "rewards/rejected": -3.874584913253784, + "epoch": 0.67, + "learning_rate": 4.313444253203289e-07, + "logits/chosen": -2.913517713546753, + "logits/rejected": -2.9551703929901123, + "logps/chosen": -270.5498962402344, + "logps/rejected": -278.1402282714844, + "loss": 0.5218, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.27814558148384094, + "rewards/margins": 2.525500535964966, + "rewards/rejected": -2.2473549842834473, "step": 1300 }, { - "epoch": 1.34, - "eval_logits/chosen": -2.880734920501709, - "eval_logits/rejected": -2.870875597000122, - "eval_logps/chosen": -355.20660400390625, - "eval_logps/rejected": -309.2440490722656, - "eval_loss": 0.5469160079956055, - "eval_rewards/accuracies": 0.7860000133514404, - "eval_rewards/chosen": -0.23407350480556488, - "eval_rewards/margins": 2.2784061431884766, - "eval_rewards/rejected": -2.512479782104492, - "eval_runtime": 499.7502, - "eval_samples_per_second": 4.002, - "eval_steps_per_second": 0.5, + "epoch": 0.67, + "eval_logits/chosen": -2.886589288711548, + "eval_logits/rejected": -2.9536550045013428, + "eval_logps/chosen": -242.95413208007812, + "eval_logps/rejected": -299.2200622558594, + "eval_loss": 0.49863824248313904, + "eval_rewards/accuracies": 0.7960000038146973, + "eval_rewards/chosen": 0.4726015031337738, + "eval_rewards/margins": 2.7808680534362793, + "eval_rewards/rejected": -2.3082668781280518, + "eval_runtime": 278.6877, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.449, "step": 1300 }, { - "epoch": 1.35, - "learning_rate": 3.0501339456563334e-07, - "logits/chosen": -2.8287034034729004, - "logits/rejected": -2.807685613632202, - "logps/chosen": -320.2387390136719, - "logps/rejected": -309.42572021484375, - "loss": 0.1098, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3659682273864746, - "rewards/margins": 5.727939128875732, - "rewards/rejected": -4.3619704246521, + "epoch": 0.68, + "learning_rate": 4.3038821954484603e-07, + "logits/chosen": -2.930354356765747, + "logits/rejected": -2.9470126628875732, + "logps/chosen": -264.21075439453125, + "logps/rejected": -299.3939514160156, + "loss": 0.434, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.06842346489429474, + "rewards/margins": 2.0977582931518555, + "rewards/rejected": -2.029334545135498, "step": 1310 }, { - "epoch": 1.36, - "learning_rate": 3.030998851894374e-07, - "logits/chosen": -2.8757596015930176, - "logits/rejected": -2.8729655742645264, - "logps/chosen": -276.88128662109375, - "logps/rejected": -288.6640625, - "loss": 0.1207, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6556949615478516, - "rewards/margins": 4.997775077819824, - "rewards/rejected": -4.342080116271973, + "epoch": 0.68, + "learning_rate": 4.2943201376936315e-07, + "logits/chosen": -2.879948377609253, + "logits/rejected": -2.91914439201355, + "logps/chosen": -240.50375366210938, + "logps/rejected": -293.88970947265625, + "loss": 0.4489, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.028376024216413498, + "rewards/margins": 2.692608594894409, + "rewards/rejected": -2.6642327308654785, "step": 1320 }, { - "epoch": 1.37, - "learning_rate": 3.0118637581324146e-07, - "logits/chosen": -2.8695642948150635, - "logits/rejected": -2.8470568656921387, - "logps/chosen": -314.7218017578125, - "logps/rejected": -314.4281921386719, - "loss": 0.1229, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0547069311141968, - "rewards/margins": 6.196503162384033, - "rewards/rejected": -5.141795635223389, + "epoch": 0.69, + "learning_rate": 4.2847580799388026e-07, + "logits/chosen": -2.8353028297424316, + "logits/rejected": -2.896289348602295, + "logps/chosen": -263.8857421875, + "logps/rejected": -333.9789123535156, + "loss": 0.4939, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.7538554072380066, + "rewards/margins": 2.6621108055114746, + "rewards/rejected": -1.9082553386688232, "step": 1330 }, { - "epoch": 1.38, - "learning_rate": 2.992728664370455e-07, - "logits/chosen": -2.832125186920166, - "logits/rejected": -2.862847328186035, - "logps/chosen": -275.20269775390625, - "logps/rejected": -299.24664306640625, - "loss": 0.1286, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 0.4705425202846527, - "rewards/margins": 5.204878807067871, - "rewards/rejected": -4.734335899353027, + "epoch": 0.69, + "learning_rate": 4.275196022183974e-07, + "logits/chosen": -2.7950098514556885, + "logits/rejected": -2.842745780944824, + "logps/chosen": -229.3527069091797, + "logps/rejected": -267.72998046875, + "loss": 0.5018, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5895912051200867, + "rewards/margins": 2.599395513534546, + "rewards/rejected": -2.0098042488098145, "step": 1340 }, { - "epoch": 1.39, - "learning_rate": 2.973593570608496e-07, - "logits/chosen": -2.8754680156707764, - "logits/rejected": -2.8437042236328125, - "logps/chosen": -312.1493835449219, - "logps/rejected": -306.14056396484375, - "loss": 0.1128, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 0.5046719908714294, - "rewards/margins": 4.981934070587158, - "rewards/rejected": -4.477262496948242, + "epoch": 0.7, + "learning_rate": 4.265633964429145e-07, + "logits/chosen": -2.8519222736358643, + "logits/rejected": -2.9202880859375, + "logps/chosen": -193.359375, + "logps/rejected": -252.4734344482422, + "loss": 0.5209, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6954705119132996, + "rewards/margins": 2.9130749702453613, + "rewards/rejected": -2.217604398727417, "step": 1350 }, { - "epoch": 1.4, - "learning_rate": 2.9544584768465365e-07, - "logits/chosen": -2.907262086868286, - "logits/rejected": -2.8830528259277344, - "logps/chosen": -334.17474365234375, - "logps/rejected": -328.50006103515625, - "loss": 0.1181, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6233978271484375, - "rewards/margins": 4.791070461273193, - "rewards/rejected": -4.167672634124756, + "epoch": 0.7, + "learning_rate": 4.256071906674316e-07, + "logits/chosen": -2.8548550605773926, + "logits/rejected": -2.936178684234619, + "logps/chosen": -321.51763916015625, + "logps/rejected": -315.2483825683594, + "loss": 0.563, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.30997490882873535, + "rewards/margins": 2.0616490840911865, + "rewards/rejected": -1.7516740560531616, "step": 1360 }, { - "epoch": 1.41, - "learning_rate": 2.935323383084577e-07, - "logits/chosen": -2.8675036430358887, - "logits/rejected": -2.826355457305908, - "logps/chosen": -351.460205078125, - "logps/rejected": -333.004150390625, - "loss": 0.1086, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3194541931152344, - "rewards/margins": 6.369981288909912, - "rewards/rejected": -5.050527095794678, + "epoch": 0.71, + "learning_rate": 4.246509848919487e-07, + "logits/chosen": -2.912834882736206, + "logits/rejected": -3.0000414848327637, + "logps/chosen": -203.55709838867188, + "logps/rejected": -300.4374084472656, + "loss": 0.5563, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.46916669607162476, + "rewards/margins": 2.8837640285491943, + "rewards/rejected": -2.414597511291504, "step": 1370 }, { - "epoch": 1.43, - "learning_rate": 2.9161882893226177e-07, - "logits/chosen": -2.929363250732422, - "logits/rejected": -2.8654415607452393, - "logps/chosen": -325.78826904296875, - "logps/rejected": -314.6927490234375, - "loss": 0.1175, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.906810462474823, - "rewards/margins": 5.147686004638672, - "rewards/rejected": -4.240875720977783, + "epoch": 0.71, + "learning_rate": 4.2369477911646584e-07, + "logits/chosen": -2.943681478500366, + "logits/rejected": -3.013051986694336, + "logps/chosen": -218.1947784423828, + "logps/rejected": -263.2126159667969, + "loss": 0.4797, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.47852277755737305, + "rewards/margins": 2.767585277557373, + "rewards/rejected": -2.289062261581421, "step": 1380 }, { - "epoch": 1.44, - "learning_rate": 2.8970531955606583e-07, - "logits/chosen": -2.9362032413482666, - "logits/rejected": -2.911289930343628, - "logps/chosen": -345.9186706542969, - "logps/rejected": -316.5303955078125, - "loss": 0.1283, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9989246129989624, - "rewards/margins": 5.8463263511657715, - "rewards/rejected": -4.847402095794678, + "epoch": 0.72, + "learning_rate": 4.2273857334098296e-07, + "logits/chosen": -2.8952932357788086, + "logits/rejected": -2.9491114616394043, + "logps/chosen": -295.91815185546875, + "logps/rejected": -293.3535461425781, + "loss": 0.5053, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.44421887397766113, + "rewards/margins": 2.2546558380126953, + "rewards/rejected": -1.8104368448257446, "step": 1390 }, { - "epoch": 1.45, - "learning_rate": 2.8779181017986984e-07, - "logits/chosen": -2.9036128520965576, - "logits/rejected": -2.927844285964966, - "logps/chosen": -364.14666748046875, - "logps/rejected": -337.1743469238281, - "loss": 0.1198, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.1708009243011475, - "rewards/margins": 5.978755950927734, - "rewards/rejected": -4.807954788208008, + "epoch": 0.72, + "learning_rate": 4.2178236756550007e-07, + "logits/chosen": -2.904174566268921, + "logits/rejected": -2.96537184715271, + "logps/chosen": -301.59210205078125, + "logps/rejected": -297.4707946777344, + "loss": 0.6129, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.25321847200393677, + "rewards/margins": 2.3061304092407227, + "rewards/rejected": -2.0529122352600098, "step": 1400 }, { - "epoch": 1.45, - "eval_logits/chosen": -2.8625848293304443, - "eval_logits/rejected": -2.8527026176452637, - "eval_logps/chosen": -356.7679443359375, - "eval_logps/rejected": -308.636962890625, - "eval_loss": 0.5245142579078674, - "eval_rewards/accuracies": 0.7720000147819519, - "eval_rewards/chosen": -0.390207976102829, - "eval_rewards/margins": 2.0615620613098145, - "eval_rewards/rejected": -2.451770067214966, - "eval_runtime": 499.4895, - "eval_samples_per_second": 4.004, - "eval_steps_per_second": 0.501, + "epoch": 0.72, + "eval_logits/chosen": -2.9437789916992188, + "eval_logits/rejected": -3.0072262287139893, + "eval_logps/chosen": -242.10218811035156, + "eval_logps/rejected": -298.38385009765625, + "eval_loss": 0.4817677140235901, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": 0.5577969551086426, + "eval_rewards/margins": 2.7824389934539795, + "eval_rewards/rejected": -2.224642038345337, + "eval_runtime": 279.044, + "eval_samples_per_second": 7.167, + "eval_steps_per_second": 0.448, "step": 1400 }, { - "epoch": 1.46, - "learning_rate": 2.858783008036739e-07, - "logits/chosen": -2.8276944160461426, - "logits/rejected": -2.841846466064453, - "logps/chosen": -348.5047302246094, - "logps/rejected": -304.74224853515625, - "loss": 0.1106, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3548219203948975, - "rewards/margins": 5.720047950744629, - "rewards/rejected": -4.365225791931152, + "epoch": 0.73, + "learning_rate": 4.208261617900172e-07, + "logits/chosen": -2.973268747329712, + "logits/rejected": -3.0276944637298584, + "logps/chosen": -262.35491943359375, + "logps/rejected": -244.64102172851562, + "loss": 0.4874, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5497137904167175, + "rewards/margins": 2.7235116958618164, + "rewards/rejected": -2.173797607421875, "step": 1410 }, { - "epoch": 1.47, - "learning_rate": 2.8396479142747797e-07, - "logits/chosen": -2.8640191555023193, - "logits/rejected": -2.8874547481536865, - "logps/chosen": -309.53729248046875, - "logps/rejected": -313.78900146484375, - "loss": 0.1083, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2925539016723633, - "rewards/margins": 5.640458106994629, - "rewards/rejected": -4.347904682159424, + "epoch": 0.73, + "learning_rate": 4.198699560145343e-07, + "logits/chosen": -2.862189292907715, + "logits/rejected": -2.9037973880767822, + "logps/chosen": -240.8940887451172, + "logps/rejected": -295.91339111328125, + "loss": 0.6516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.6252197623252869, + "rewards/margins": 2.2482056617736816, + "rewards/rejected": -1.6229861974716187, "step": 1420 }, { - "epoch": 1.48, - "learning_rate": 2.8205128205128203e-07, - "logits/chosen": -2.8046693801879883, - "logits/rejected": -2.796954393386841, - "logps/chosen": -321.08740234375, - "logps/rejected": -325.5114440917969, - "loss": 0.1134, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.1036927700042725, - "rewards/margins": 5.514297962188721, - "rewards/rejected": -4.410605430603027, + "epoch": 0.74, + "learning_rate": 4.189137502390514e-07, + "logits/chosen": -2.931879997253418, + "logits/rejected": -2.965914487838745, + "logps/chosen": -259.8681335449219, + "logps/rejected": -316.9034729003906, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13505953550338745, + "rewards/margins": 2.2420730590820312, + "rewards/rejected": -2.107013702392578, "step": 1430 }, { - "epoch": 1.49, - "learning_rate": 2.801377726750861e-07, - "logits/chosen": -2.8305039405822754, - "logits/rejected": -2.8174493312835693, - "logps/chosen": -316.93365478515625, - "logps/rejected": -325.77801513671875, - "loss": 0.1059, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.1122941970825195, - "rewards/margins": 5.836568832397461, - "rewards/rejected": -4.724274158477783, + "epoch": 0.74, + "learning_rate": 4.179575444635686e-07, + "logits/chosen": -2.9142744541168213, + "logits/rejected": -2.95097279548645, + "logps/chosen": -305.0422668457031, + "logps/rejected": -317.8971252441406, + "loss": 0.4804, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6268448233604431, + "rewards/margins": 2.3850245475769043, + "rewards/rejected": -1.7581799030303955, "step": 1440 }, { - "epoch": 1.5, - "learning_rate": 2.7822426329889015e-07, - "logits/chosen": -2.8809874057769775, - "logits/rejected": -2.8768084049224854, - "logps/chosen": -316.82830810546875, - "logps/rejected": -330.55633544921875, - "loss": 0.1244, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0005091428756714, - "rewards/margins": 5.6459479331970215, - "rewards/rejected": -4.645439147949219, + "epoch": 0.75, + "learning_rate": 4.170013386880857e-07, + "logits/chosen": -2.8860132694244385, + "logits/rejected": -2.9283335208892822, + "logps/chosen": -219.405029296875, + "logps/rejected": -327.70123291015625, + "loss": 0.4487, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6366171836853027, + "rewards/margins": 2.8044886589050293, + "rewards/rejected": -2.1678714752197266, "step": 1450 }, { - "epoch": 1.51, - "learning_rate": 2.763107539226942e-07, - "logits/chosen": -2.9045822620391846, - "logits/rejected": -2.8883230686187744, - "logps/chosen": -330.44024658203125, - "logps/rejected": -350.8652648925781, - "loss": 0.109, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.5636141300201416, - "rewards/margins": 6.590689182281494, - "rewards/rejected": -5.027074813842773, + "epoch": 0.75, + "learning_rate": 4.1604513291260277e-07, + "logits/chosen": -2.8814711570739746, + "logits/rejected": -2.9225306510925293, + "logps/chosen": -218.7816162109375, + "logps/rejected": -295.43267822265625, + "loss": 0.5484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5335996150970459, + "rewards/margins": 2.8809235095977783, + "rewards/rejected": -2.3473238945007324, "step": 1460 }, { - "epoch": 1.52, - "learning_rate": 2.743972445464983e-07, - "logits/chosen": -2.889626979827881, - "logits/rejected": -2.865269422531128, - "logps/chosen": -308.53887939453125, - "logps/rejected": -303.1893005371094, - "loss": 0.103, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.100376844406128, - "rewards/margins": 5.918498992919922, - "rewards/rejected": -4.818121910095215, + "epoch": 0.76, + "learning_rate": 4.150889271371199e-07, + "logits/chosen": -2.804237127304077, + "logits/rejected": -2.8750622272491455, + "logps/chosen": -279.5050964355469, + "logps/rejected": -276.02874755859375, + "loss": 0.5363, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5062381625175476, + "rewards/margins": 3.0678353309631348, + "rewards/rejected": -2.5615968704223633, "step": 1470 }, { - "epoch": 1.53, - "learning_rate": 2.7248373517030234e-07, - "logits/chosen": -2.8868496417999268, - "logits/rejected": -2.901533842086792, - "logps/chosen": -335.3681335449219, - "logps/rejected": -349.62115478515625, - "loss": 0.1122, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8411431312561035, - "rewards/margins": 5.649033546447754, - "rewards/rejected": -4.807890892028809, + "epoch": 0.76, + "learning_rate": 4.14132721361637e-07, + "logits/chosen": -2.817847728729248, + "logits/rejected": -2.8680078983306885, + "logps/chosen": -246.18038940429688, + "logps/rejected": -220.83285522460938, + "loss": 0.3628, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5915089249610901, + "rewards/margins": 3.000457286834717, + "rewards/rejected": -2.4089484214782715, "step": 1480 }, { - "epoch": 1.54, - "learning_rate": 2.705702257941064e-07, - "logits/chosen": -2.8572373390197754, - "logits/rejected": -2.869990110397339, - "logps/chosen": -396.15887451171875, - "logps/rejected": -329.05584716796875, - "loss": 0.1019, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.6818903684616089, - "rewards/margins": 5.830107688903809, - "rewards/rejected": -5.148218154907227, + "epoch": 0.77, + "learning_rate": 4.131765155861541e-07, + "logits/chosen": -2.776465892791748, + "logits/rejected": -2.858459949493408, + "logps/chosen": -230.6031494140625, + "logps/rejected": -250.45510864257812, + "loss": 0.4606, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.39864081144332886, + "rewards/margins": 3.0935165882110596, + "rewards/rejected": -2.694875478744507, "step": 1490 }, { - "epoch": 1.55, - "learning_rate": 2.686567164179104e-07, - "logits/chosen": -2.8778789043426514, - "logits/rejected": -2.8655364513397217, - "logps/chosen": -313.30499267578125, - "logps/rejected": -300.9337463378906, - "loss": 0.1122, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5041682720184326, - "rewards/margins": 5.160029411315918, - "rewards/rejected": -4.6558613777160645, + "epoch": 0.77, + "learning_rate": 4.1222030981067123e-07, + "logits/chosen": -2.847076654434204, + "logits/rejected": -2.8960652351379395, + "logps/chosen": -267.3013916015625, + "logps/rejected": -305.4517822265625, + "loss": 0.3862, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.035213030874729156, + "rewards/margins": 2.688610076904297, + "rewards/rejected": -2.6533970832824707, "step": 1500 }, { - "epoch": 1.55, - "eval_logits/chosen": -2.8394267559051514, - "eval_logits/rejected": -2.831911563873291, - "eval_logps/chosen": -359.64434814453125, - "eval_logps/rejected": -315.60784912109375, - "eval_loss": 0.552358865737915, - "eval_rewards/accuracies": 0.7860000133514404, - "eval_rewards/chosen": -0.6778488755226135, - "eval_rewards/margins": 2.471008539199829, - "eval_rewards/rejected": -3.1488571166992188, - "eval_runtime": 499.603, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 0.77, + "eval_logits/chosen": -2.8354265689849854, + "eval_logits/rejected": -2.897573947906494, + "eval_logps/chosen": -244.42625427246094, + "eval_logps/rejected": -302.6622009277344, + "eval_loss": 0.4689019024372101, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": 0.32539257407188416, + "eval_rewards/margins": 2.97786808013916, + "eval_rewards/rejected": -2.652475595474243, + "eval_runtime": 278.9347, + "eval_samples_per_second": 7.17, + "eval_steps_per_second": 0.448, "step": 1500 }, { - "epoch": 1.56, - "learning_rate": 2.6674320704171447e-07, - "logits/chosen": -2.8329129219055176, - "logits/rejected": -2.8036797046661377, - "logps/chosen": -313.36224365234375, - "logps/rejected": -286.71441650390625, - "loss": 0.0965, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.6077798008918762, - "rewards/margins": 5.281628608703613, - "rewards/rejected": -4.673849105834961, + "epoch": 0.78, + "learning_rate": 4.1126410403518835e-07, + "logits/chosen": -2.8334925174713135, + "logits/rejected": -2.8591761589050293, + "logps/chosen": -221.348876953125, + "logps/rejected": -311.17877197265625, + "loss": 0.4741, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.015551751479506493, + "rewards/margins": 2.4997267723083496, + "rewards/rejected": -2.484175443649292, "step": 1510 }, { - "epoch": 1.57, - "learning_rate": 2.6482969766551853e-07, - "logits/chosen": -2.8273487091064453, - "logits/rejected": -2.837484121322632, - "logps/chosen": -315.29852294921875, - "logps/rejected": -347.20343017578125, - "loss": 0.1045, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.0901168584823608, - "rewards/margins": 5.834365367889404, - "rewards/rejected": -4.744248390197754, + "epoch": 0.78, + "learning_rate": 4.1030789825970546e-07, + "logits/chosen": -2.754701852798462, + "logits/rejected": -2.8493895530700684, + "logps/chosen": -275.2632751464844, + "logps/rejected": -338.8531494140625, + "loss": 0.4382, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5648075342178345, + "rewards/margins": 3.1909804344177246, + "rewards/rejected": -2.6261725425720215, "step": 1520 }, { - "epoch": 1.58, - "learning_rate": 2.629161882893226e-07, - "logits/chosen": -2.8795628547668457, - "logits/rejected": -2.862488269805908, - "logps/chosen": -348.51812744140625, - "logps/rejected": -306.18621826171875, - "loss": 0.0929, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8217048645019531, - "rewards/margins": 5.343213081359863, - "rewards/rejected": -4.52150821685791, + "epoch": 0.79, + "learning_rate": 4.093516924842226e-07, + "logits/chosen": -2.797532558441162, + "logits/rejected": -2.8553106784820557, + "logps/chosen": -282.1953125, + "logps/rejected": -335.3280334472656, + "loss": 0.3387, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.3751303553581238, + "rewards/margins": 3.3121883869171143, + "rewards/rejected": -2.9370579719543457, "step": 1530 }, { - "epoch": 1.59, - "learning_rate": 2.6100267891312666e-07, - "logits/chosen": -2.8448562622070312, - "logits/rejected": -2.8945586681365967, - "logps/chosen": -353.49822998046875, - "logps/rejected": -330.5164794921875, - "loss": 0.1189, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.4604480266571045, - "rewards/margins": 6.311491966247559, - "rewards/rejected": -4.851044178009033, + "epoch": 0.8, + "learning_rate": 4.083954867087397e-07, + "logits/chosen": -2.895432710647583, + "logits/rejected": -2.9414145946502686, + "logps/chosen": -220.44735717773438, + "logps/rejected": -296.5469970703125, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16891750693321228, + "rewards/margins": 2.6762678623199463, + "rewards/rejected": -2.507350444793701, "step": 1540 }, { - "epoch": 1.6, - "learning_rate": 2.590891695369307e-07, - "logits/chosen": -2.8233683109283447, - "logits/rejected": -2.811624050140381, - "logps/chosen": -303.26861572265625, - "logps/rejected": -277.67401123046875, - "loss": 0.1288, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.3482983112335205, - "rewards/margins": 5.738016605377197, - "rewards/rejected": -4.389718055725098, + "epoch": 0.8, + "learning_rate": 4.074392809332568e-07, + "logits/chosen": -2.8410191535949707, + "logits/rejected": -2.9066405296325684, + "logps/chosen": -316.89971923828125, + "logps/rejected": -309.38934326171875, + "loss": 0.4013, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.8427516222000122, + "rewards/margins": 3.9204134941101074, + "rewards/rejected": -3.0776619911193848, "step": 1550 }, { - "epoch": 1.61, - "learning_rate": 2.571756601607348e-07, - "logits/chosen": -2.8882668018341064, - "logits/rejected": -2.8739676475524902, - "logps/chosen": -373.87176513671875, - "logps/rejected": -320.65447998046875, - "loss": 0.1383, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.4747350215911865, - "rewards/margins": 6.119807243347168, - "rewards/rejected": -4.645071983337402, + "epoch": 0.81, + "learning_rate": 4.064830751577739e-07, + "logits/chosen": -2.889589548110962, + "logits/rejected": -2.917020320892334, + "logps/chosen": -243.4337921142578, + "logps/rejected": -297.63690185546875, + "loss": 0.4869, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4400361478328705, + "rewards/margins": 2.95405912399292, + "rewards/rejected": -2.5140228271484375, "step": 1560 }, { - "epoch": 1.62, - "learning_rate": 2.5526215078453884e-07, - "logits/chosen": -2.880866527557373, - "logits/rejected": -2.874523878097534, - "logps/chosen": -335.191650390625, - "logps/rejected": -368.77197265625, - "loss": 0.0904, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.1102583408355713, - "rewards/margins": 6.102787017822266, - "rewards/rejected": -4.992527961730957, + "epoch": 0.81, + "learning_rate": 4.0552686938229104e-07, + "logits/chosen": -2.902555227279663, + "logits/rejected": -2.956498384475708, + "logps/chosen": -219.6063232421875, + "logps/rejected": -291.71685791015625, + "loss": 0.4151, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.315045028924942, + "rewards/margins": 2.819603204727173, + "rewards/rejected": -2.5045580863952637, "step": 1570 }, { - "epoch": 1.63, - "learning_rate": 2.533486414083429e-07, - "logits/chosen": -2.92406964302063, - "logits/rejected": -2.9168031215667725, - "logps/chosen": -344.280517578125, - "logps/rejected": -319.5175476074219, - "loss": 0.0894, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.228049874305725, - "rewards/margins": 5.966233253479004, - "rewards/rejected": -4.738183498382568, + "epoch": 0.82, + "learning_rate": 4.045706636068082e-07, + "logits/chosen": -2.898637056350708, + "logits/rejected": -2.9533817768096924, + "logps/chosen": -252.69180297851562, + "logps/rejected": -284.62744140625, + "loss": 0.4136, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.10596165806055069, + "rewards/margins": 3.081064462661743, + "rewards/rejected": -3.187026262283325, "step": 1580 }, { - "epoch": 1.64, - "learning_rate": 2.5143513203214697e-07, - "logits/chosen": -2.846970796585083, - "logits/rejected": -2.824373245239258, - "logps/chosen": -339.29522705078125, - "logps/rejected": -325.4171142578125, - "loss": 0.0822, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.318579077720642, - "rewards/margins": 6.132169246673584, - "rewards/rejected": -4.813591003417969, + "epoch": 0.82, + "learning_rate": 4.036144578313253e-07, + "logits/chosen": -2.8751070499420166, + "logits/rejected": -2.9103922843933105, + "logps/chosen": -226.5676727294922, + "logps/rejected": -296.97467041015625, + "loss": 0.4177, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16602222621440887, + "rewards/margins": 2.9820897579193115, + "rewards/rejected": -2.8160674571990967, "step": 1590 }, { - "epoch": 1.65, - "learning_rate": 2.49521622655951e-07, - "logits/chosen": -2.776440143585205, - "logits/rejected": -2.780971050262451, - "logps/chosen": -346.0995178222656, - "logps/rejected": -348.487060546875, - "loss": 0.11, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.384293556213379, - "rewards/margins": 5.884185791015625, - "rewards/rejected": -4.499892234802246, + "epoch": 0.83, + "learning_rate": 4.0265825205584244e-07, + "logits/chosen": -2.88401460647583, + "logits/rejected": -2.9255869388580322, + "logps/chosen": -281.10113525390625, + "logps/rejected": -315.5570068359375, + "loss": 0.4186, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1843784749507904, + "rewards/margins": 2.7065744400024414, + "rewards/rejected": -2.8909528255462646, "step": 1600 }, { - "epoch": 1.65, - "eval_logits/chosen": -2.855889320373535, - "eval_logits/rejected": -2.846108913421631, - "eval_logps/chosen": -357.83306884765625, - "eval_logps/rejected": -311.6435241699219, - "eval_loss": 0.5355437397956848, - "eval_rewards/accuracies": 0.777999997138977, - "eval_rewards/chosen": -0.4967198967933655, - "eval_rewards/margins": 2.2557058334350586, - "eval_rewards/rejected": -2.7524256706237793, - "eval_runtime": 499.2286, - "eval_samples_per_second": 4.006, - "eval_steps_per_second": 0.501, + "epoch": 0.83, + "eval_logits/chosen": -2.8588759899139404, + "eval_logits/rejected": -2.920731544494629, + "eval_logps/chosen": -244.61883544921875, + "eval_logps/rejected": -305.6510925292969, + "eval_loss": 0.4497062563896179, + "eval_rewards/accuracies": 0.8040000200271606, + "eval_rewards/chosen": 0.3061320185661316, + "eval_rewards/margins": 3.2575008869171143, + "eval_rewards/rejected": -2.951368570327759, + "eval_runtime": 279.1026, + "eval_samples_per_second": 7.166, + "eval_steps_per_second": 0.448, "step": 1600 }, { - "epoch": 1.66, - "learning_rate": 2.4760811327975504e-07, - "logits/chosen": -2.8780884742736816, - "logits/rejected": -2.892906665802002, - "logps/chosen": -377.72430419921875, - "logps/rejected": -332.82794189453125, - "loss": 0.1108, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 1.1861491203308105, - "rewards/margins": 5.642457485198975, - "rewards/rejected": -4.456308364868164, + "epoch": 0.83, + "learning_rate": 4.0170204628035956e-07, + "logits/chosen": -2.9068946838378906, + "logits/rejected": -2.9520983695983887, + "logps/chosen": -196.4069061279297, + "logps/rejected": -263.8167419433594, + "loss": 0.4258, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.18958450853824615, + "rewards/margins": 3.1812098026275635, + "rewards/rejected": -2.9916253089904785, "step": 1610 }, { - "epoch": 1.67, - "learning_rate": 2.456946039035591e-07, - "logits/chosen": -2.8738551139831543, - "logits/rejected": -2.876325845718384, - "logps/chosen": -366.89691162109375, - "logps/rejected": -316.91021728515625, - "loss": 0.1101, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4744036197662354, - "rewards/margins": 5.994829177856445, - "rewards/rejected": -4.520425319671631, + "epoch": 0.84, + "learning_rate": 4.007458405048766e-07, + "logits/chosen": -2.880481243133545, + "logits/rejected": -2.923464298248291, + "logps/chosen": -273.96710205078125, + "logps/rejected": -340.50286865234375, + "loss": 0.4167, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3789766728878021, + "rewards/margins": 3.0592124462127686, + "rewards/rejected": -2.6802358627319336, "step": 1620 }, { - "epoch": 1.68, - "learning_rate": 2.4378109452736316e-07, - "logits/chosen": -2.869784355163574, - "logits/rejected": -2.8577165603637695, - "logps/chosen": -334.3713073730469, - "logps/rejected": -318.83404541015625, - "loss": 0.1145, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.051092505455017, - "rewards/margins": 5.565567970275879, - "rewards/rejected": -4.5144758224487305, + "epoch": 0.84, + "learning_rate": 3.9978963472939373e-07, + "logits/chosen": -2.9000725746154785, + "logits/rejected": -2.9560017585754395, + "logps/chosen": -260.95379638671875, + "logps/rejected": -287.95977783203125, + "loss": 0.4137, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.29968172311782837, + "rewards/margins": 3.1619606018066406, + "rewards/rejected": -2.862278938293457, "step": 1630 }, { - "epoch": 1.69, - "learning_rate": 2.418675851511672e-07, - "logits/chosen": -2.8268637657165527, - "logits/rejected": -2.8105154037475586, - "logps/chosen": -351.9185791015625, - "logps/rejected": -307.3794860839844, - "loss": 0.081, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2795428037643433, - "rewards/margins": 5.681187629699707, - "rewards/rejected": -4.401645183563232, + "epoch": 0.85, + "learning_rate": 3.9883342895391085e-07, + "logits/chosen": -2.937865734100342, + "logits/rejected": -2.9621806144714355, + "logps/chosen": -268.20404052734375, + "logps/rejected": -301.2460021972656, + "loss": 0.4676, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.061094582080841064, + "rewards/margins": 3.1974027156829834, + "rewards/rejected": -3.136308193206787, "step": 1640 }, { - "epoch": 1.7, - "learning_rate": 2.399540757749713e-07, - "logits/chosen": -2.8763937950134277, - "logits/rejected": -2.880305767059326, - "logps/chosen": -334.76763916015625, - "logps/rejected": -356.3011169433594, - "loss": 0.1503, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.0690113306045532, - "rewards/margins": 5.896848678588867, - "rewards/rejected": -4.8278374671936035, + "epoch": 0.85, + "learning_rate": 3.9787722317842796e-07, + "logits/chosen": -2.9573769569396973, + "logits/rejected": -3.0099475383758545, + "logps/chosen": -239.37808227539062, + "logps/rejected": -249.80813598632812, + "loss": 0.444, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.42435941100120544, + "rewards/margins": 3.010646104812622, + "rewards/rejected": -2.586287021636963, "step": 1650 }, { - "epoch": 1.71, - "learning_rate": 2.3804056639877535e-07, - "logits/chosen": -2.8334081172943115, - "logits/rejected": -2.8573122024536133, - "logps/chosen": -297.3508605957031, - "logps/rejected": -295.17132568359375, - "loss": 0.1173, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 1.0110653638839722, - "rewards/margins": 5.1915178298950195, - "rewards/rejected": -4.1804518699646, + "epoch": 0.86, + "learning_rate": 3.969210174029451e-07, + "logits/chosen": -2.910501003265381, + "logits/rejected": -2.9044454097747803, + "logps/chosen": -292.47503662109375, + "logps/rejected": -347.2369384765625, + "loss": 0.4663, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.30727043747901917, + "rewards/margins": 2.7217769622802734, + "rewards/rejected": -2.414506435394287, "step": 1660 }, { - "epoch": 1.72, - "learning_rate": 2.361270570225794e-07, - "logits/chosen": -2.8628039360046387, - "logits/rejected": -2.847696542739868, - "logps/chosen": -349.5933837890625, - "logps/rejected": -297.1512451171875, - "loss": 0.0924, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.53627610206604, - "rewards/margins": 5.680062770843506, - "rewards/rejected": -4.143786430358887, + "epoch": 0.86, + "learning_rate": 3.959648116274622e-07, + "logits/chosen": -2.8908803462982178, + "logits/rejected": -2.9170987606048584, + "logps/chosen": -234.23165893554688, + "logps/rejected": -281.02850341796875, + "loss": 0.5076, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2927956283092499, + "rewards/margins": 3.0067875385284424, + "rewards/rejected": -2.71399188041687, "step": 1670 }, { - "epoch": 1.74, - "learning_rate": 2.3421354764638345e-07, - "logits/chosen": -2.8902525901794434, - "logits/rejected": -2.8761532306671143, - "logps/chosen": -328.5924377441406, - "logps/rejected": -352.31756591796875, - "loss": 0.0937, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.1631652116775513, - "rewards/margins": 5.828993320465088, - "rewards/rejected": -4.665827751159668, + "epoch": 0.87, + "learning_rate": 3.950086058519793e-07, + "logits/chosen": -2.9678280353546143, + "logits/rejected": -3.010523796081543, + "logps/chosen": -209.88211059570312, + "logps/rejected": -288.1944274902344, + "loss": 0.5021, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3828034996986389, + "rewards/margins": 2.6173367500305176, + "rewards/rejected": -3.000140428543091, "step": 1680 }, { - "epoch": 1.75, - "learning_rate": 2.323000382701875e-07, - "logits/chosen": -2.8299720287323, - "logits/rejected": -2.8690247535705566, - "logps/chosen": -341.2825622558594, - "logps/rejected": -324.95574951171875, - "loss": 0.1036, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.3289188146591187, - "rewards/margins": 5.7310357093811035, - "rewards/rejected": -4.402116775512695, + "epoch": 0.87, + "learning_rate": 3.9405240007649643e-07, + "logits/chosen": -2.9930944442749023, + "logits/rejected": -3.051088333129883, + "logps/chosen": -218.7351531982422, + "logps/rejected": -301.77264404296875, + "loss": 0.4626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22515520453453064, + "rewards/margins": 2.75825834274292, + "rewards/rejected": -2.9834134578704834, "step": 1690 }, { - "epoch": 1.76, - "learning_rate": 2.3038652889399157e-07, - "logits/chosen": -2.8278417587280273, - "logits/rejected": -2.8181910514831543, - "logps/chosen": -305.45843505859375, - "logps/rejected": -333.2713928222656, - "loss": 0.1092, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8609102368354797, - "rewards/margins": 5.566982269287109, - "rewards/rejected": -4.7060723304748535, + "epoch": 0.88, + "learning_rate": 3.9309619430101354e-07, + "logits/chosen": -2.917217493057251, + "logits/rejected": -2.9686226844787598, + "logps/chosen": -261.61724853515625, + "logps/rejected": -286.07110595703125, + "loss": 0.4765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.12368907034397125, + "rewards/margins": 2.886246919631958, + "rewards/rejected": -2.7625582218170166, "step": 1700 }, { - "epoch": 1.76, - "eval_logits/chosen": -2.8601489067077637, - "eval_logits/rejected": -2.8501791954040527, - "eval_logps/chosen": -358.9333190917969, - "eval_logps/rejected": -315.2887878417969, - "eval_loss": 0.5580697059631348, - "eval_rewards/accuracies": 0.7799999713897705, - "eval_rewards/chosen": -0.606745719909668, - "eval_rewards/margins": 2.510206699371338, - "eval_rewards/rejected": -3.1169521808624268, - "eval_runtime": 499.7093, - "eval_samples_per_second": 4.002, - "eval_steps_per_second": 0.5, + "epoch": 0.88, + "eval_logits/chosen": -2.924076557159424, + "eval_logits/rejected": -2.9836485385894775, + "eval_logps/chosen": -243.892578125, + "eval_logps/rejected": -302.3619384765625, + "eval_loss": 0.42963850498199463, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": 0.3787572979927063, + "eval_rewards/margins": 3.001209020614624, + "eval_rewards/rejected": -2.6224520206451416, + "eval_runtime": 278.6405, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.449, "step": 1700 }, { - "epoch": 1.77, - "learning_rate": 2.2847301951779563e-07, - "logits/chosen": -2.837820529937744, - "logits/rejected": -2.831998825073242, - "logps/chosen": -320.96673583984375, - "logps/rejected": -343.4007568359375, - "loss": 0.094, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5116377472877502, - "rewards/margins": 5.757063388824463, - "rewards/rejected": -5.245425224304199, + "epoch": 0.88, + "learning_rate": 3.9213998852553066e-07, + "logits/chosen": -2.944732904434204, + "logits/rejected": -2.988416910171509, + "logps/chosen": -319.26422119140625, + "logps/rejected": -304.89996337890625, + "loss": 0.4007, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.012713325209915638, + "rewards/margins": 2.6510703563690186, + "rewards/rejected": -2.638357162475586, "step": 1710 }, { - "epoch": 1.78, - "learning_rate": 2.265595101415997e-07, - "logits/chosen": -2.756493091583252, - "logits/rejected": -2.763211488723755, - "logps/chosen": -317.48724365234375, - "logps/rejected": -356.63848876953125, - "loss": 0.0945, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.9465705156326294, - "rewards/margins": 6.130054473876953, - "rewards/rejected": -5.1834845542907715, + "epoch": 0.89, + "learning_rate": 3.9118378275004783e-07, + "logits/chosen": -2.9039697647094727, + "logits/rejected": -2.9630255699157715, + "logps/chosen": -222.86685180664062, + "logps/rejected": -363.20196533203125, + "loss": 0.4875, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.03015981614589691, + "rewards/margins": 2.9296727180480957, + "rewards/rejected": -2.899512767791748, "step": 1720 }, { - "epoch": 1.79, - "learning_rate": 2.2464600076540373e-07, - "logits/chosen": -2.8534083366394043, - "logits/rejected": -2.858447551727295, - "logps/chosen": -358.9150085449219, - "logps/rejected": -313.8974914550781, - "loss": 0.1016, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 1.2323501110076904, - "rewards/margins": 5.758856773376465, - "rewards/rejected": -4.526506423950195, + "epoch": 0.89, + "learning_rate": 3.9022757697456494e-07, + "logits/chosen": -2.8391387462615967, + "logits/rejected": -2.943779230117798, + "logps/chosen": -302.57977294921875, + "logps/rejected": -337.5728759765625, + "loss": 0.4998, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.36434197425842285, + "rewards/margins": 2.9870846271514893, + "rewards/rejected": -2.6227424144744873, "step": 1730 }, { - "epoch": 1.8, - "learning_rate": 2.227324913892078e-07, - "logits/chosen": -2.912522077560425, - "logits/rejected": -2.897264003753662, - "logps/chosen": -322.6852111816406, - "logps/rejected": -323.4605407714844, - "loss": 0.1225, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.682808518409729, - "rewards/margins": 5.444243907928467, - "rewards/rejected": -4.761435508728027, + "epoch": 0.9, + "learning_rate": 3.8927137119908206e-07, + "logits/chosen": -2.919332981109619, + "logits/rejected": -2.9586234092712402, + "logps/chosen": -288.93377685546875, + "logps/rejected": -283.7138977050781, + "loss": 0.4699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.00860520638525486, + "rewards/margins": 2.815186023712158, + "rewards/rejected": -2.823791742324829, "step": 1740 }, { - "epoch": 1.81, - "learning_rate": 2.2081898201301186e-07, - "logits/chosen": -2.934340715408325, - "logits/rejected": -2.9151082038879395, - "logps/chosen": -325.00506591796875, - "logps/rejected": -297.88970947265625, - "loss": 0.1372, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 0.8912375569343567, - "rewards/margins": 5.291516304016113, - "rewards/rejected": -4.400278568267822, + "epoch": 0.9, + "learning_rate": 3.883151654235992e-07, + "logits/chosen": -2.885148286819458, + "logits/rejected": -2.940183401107788, + "logps/chosen": -286.89007568359375, + "logps/rejected": -312.8641662597656, + "loss": 0.4729, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12667697668075562, + "rewards/margins": 2.965704917907715, + "rewards/rejected": -3.0923824310302734, "step": 1750 }, { - "epoch": 1.82, - "learning_rate": 2.1890547263681592e-07, - "logits/chosen": -2.913761615753174, - "logits/rejected": -2.885958671569824, - "logps/chosen": -306.1551818847656, - "logps/rejected": -300.98052978515625, - "loss": 0.0873, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.9583765268325806, - "rewards/margins": 5.630650520324707, - "rewards/rejected": -4.672274589538574, + "epoch": 0.91, + "learning_rate": 3.873589596481163e-07, + "logits/chosen": -2.898613929748535, + "logits/rejected": -2.983856439590454, + "logps/chosen": -218.9492645263672, + "logps/rejected": -316.5523986816406, + "loss": 0.5033, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.018303874880075455, + "rewards/margins": 3.3192203044891357, + "rewards/rejected": -3.3375244140625, "step": 1760 }, { - "epoch": 1.83, - "learning_rate": 2.1699196326061998e-07, - "logits/chosen": -2.8752079010009766, - "logits/rejected": -2.8725571632385254, - "logps/chosen": -303.1614685058594, - "logps/rejected": -306.0841369628906, - "loss": 0.2217, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.1173279285430908, - "rewards/margins": 5.881751537322998, - "rewards/rejected": -4.76442289352417, + "epoch": 0.91, + "learning_rate": 3.864027538726334e-07, + "logits/chosen": -2.874910593032837, + "logits/rejected": -2.951063394546509, + "logps/chosen": -181.5255584716797, + "logps/rejected": -294.26025390625, + "loss": 0.4505, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.6372448205947876, + "rewards/margins": 3.9570529460906982, + "rewards/rejected": -3.3198082447052, "step": 1770 }, { - "epoch": 1.84, - "learning_rate": 2.1507845388442402e-07, - "logits/chosen": -2.869429588317871, - "logits/rejected": -2.8998007774353027, - "logps/chosen": -333.4254455566406, - "logps/rejected": -317.5960998535156, - "loss": 0.0849, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.9832474589347839, - "rewards/margins": 5.767635822296143, - "rewards/rejected": -4.784388065338135, + "epoch": 0.92, + "learning_rate": 3.8544654809715047e-07, + "logits/chosen": -2.9166131019592285, + "logits/rejected": -2.9520506858825684, + "logps/chosen": -233.5117645263672, + "logps/rejected": -308.47698974609375, + "loss": 0.4564, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.3196285665035248, + "rewards/margins": 2.8492321968078613, + "rewards/rejected": -3.168860673904419, "step": 1780 }, { - "epoch": 1.85, - "learning_rate": 2.1316494450822808e-07, - "logits/chosen": -2.8548214435577393, - "logits/rejected": -2.875349521636963, - "logps/chosen": -364.0671691894531, - "logps/rejected": -348.7872009277344, - "loss": 0.0693, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.9190314412117004, - "rewards/margins": 6.185535430908203, - "rewards/rejected": -5.266503810882568, + "epoch": 0.92, + "learning_rate": 3.844903423216676e-07, + "logits/chosen": -2.9044277667999268, + "logits/rejected": -2.9424145221710205, + "logps/chosen": -222.7826690673828, + "logps/rejected": -291.5748596191406, + "loss": 0.4887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.017815064638853073, + "rewards/margins": 2.7665205001831055, + "rewards/rejected": -2.7843356132507324, "step": 1790 }, { - "epoch": 1.86, - "learning_rate": 2.1125143513203214e-07, - "logits/chosen": -2.9028079509735107, - "logits/rejected": -2.8680977821350098, - "logps/chosen": -333.8460388183594, - "logps/rejected": -318.41485595703125, - "loss": 0.0958, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.7885669469833374, - "rewards/margins": 5.999600410461426, - "rewards/rejected": -5.211032867431641, + "epoch": 0.93, + "learning_rate": 3.835341365461847e-07, + "logits/chosen": -2.8614282608032227, + "logits/rejected": -2.894472122192383, + "logps/chosen": -240.697021484375, + "logps/rejected": -293.87872314453125, + "loss": 0.4783, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.362933486700058, + "rewards/margins": 2.6759090423583984, + "rewards/rejected": -3.0388426780700684, "step": 1800 }, { - "epoch": 1.86, - "eval_logits/chosen": -2.8540711402893066, - "eval_logits/rejected": -2.84735107421875, - "eval_logps/chosen": -360.1445617675781, - "eval_logps/rejected": -316.60345458984375, - "eval_loss": 0.5646550059318542, - "eval_rewards/accuracies": 0.7760000228881836, - "eval_rewards/chosen": -0.727875292301178, - "eval_rewards/margins": 2.5205445289611816, - "eval_rewards/rejected": -3.248420000076294, - "eval_runtime": 499.6295, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 0.93, + "eval_logits/chosen": -2.886509418487549, + "eval_logits/rejected": -2.953368902206421, + "eval_logps/chosen": -246.73577880859375, + "eval_logps/rejected": -306.0054931640625, + "eval_loss": 0.44217291474342346, + "eval_rewards/accuracies": 0.8040000200271606, + "eval_rewards/chosen": 0.09443826228380203, + "eval_rewards/margins": 3.081247329711914, + "eval_rewards/rejected": -2.986809015274048, + "eval_runtime": 278.8656, + "eval_samples_per_second": 7.172, + "eval_steps_per_second": 0.448, "step": 1800 }, { - "epoch": 1.87, - "learning_rate": 2.093379257558362e-07, - "logits/chosen": -2.8835785388946533, - "logits/rejected": -2.8600425720214844, - "logps/chosen": -308.075439453125, - "logps/rejected": -318.9173278808594, - "loss": 0.1363, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.6390537023544312, - "rewards/margins": 5.642648220062256, - "rewards/rejected": -5.003594398498535, + "epoch": 0.93, + "learning_rate": 3.825779307707018e-07, + "logits/chosen": -2.8623242378234863, + "logits/rejected": -2.9309682846069336, + "logps/chosen": -133.83680725097656, + "logps/rejected": -273.0003662109375, + "loss": 0.4085, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.12987324595451355, + "rewards/margins": 3.0977888107299805, + "rewards/rejected": -2.9679155349731445, "step": 1810 }, { - "epoch": 1.88, - "learning_rate": 2.0742441637964026e-07, - "logits/chosen": -2.807243824005127, - "logits/rejected": -2.837427854537964, - "logps/chosen": -361.7774353027344, - "logps/rejected": -323.4786071777344, - "loss": 0.1082, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 1.1513266563415527, - "rewards/margins": 6.465740203857422, - "rewards/rejected": -5.314412593841553, + "epoch": 0.94, + "learning_rate": 3.8162172499521893e-07, + "logits/chosen": -2.8795676231384277, + "logits/rejected": -2.9275200366973877, + "logps/chosen": -240.89450073242188, + "logps/rejected": -249.7790985107422, + "loss": 0.5003, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2536230683326721, + "rewards/margins": 3.140211582183838, + "rewards/rejected": -2.8865885734558105, "step": 1820 }, { - "epoch": 1.89, - "learning_rate": 2.055109070034443e-07, - "logits/chosen": -2.865370035171509, - "logits/rejected": -2.8720335960388184, - "logps/chosen": -337.0066223144531, - "logps/rejected": -313.6158752441406, - "loss": 0.1168, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.4502478539943695, - "rewards/margins": 5.214186668395996, - "rewards/rejected": -4.763939380645752, + "epoch": 0.94, + "learning_rate": 3.8066551921973605e-07, + "logits/chosen": -2.8502426147460938, + "logits/rejected": -2.9374024868011475, + "logps/chosen": -202.7273712158203, + "logps/rejected": -271.82916259765625, + "loss": 0.4376, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.48944076895713806, + "rewards/margins": 2.6945252418518066, + "rewards/rejected": -2.2050845623016357, "step": 1830 }, { - "epoch": 1.9, - "learning_rate": 2.0359739762724836e-07, - "logits/chosen": -2.8631224632263184, - "logits/rejected": -2.8733620643615723, - "logps/chosen": -346.0338134765625, - "logps/rejected": -347.346435546875, - "loss": 0.0798, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.661190927028656, - "rewards/margins": 6.034590721130371, - "rewards/rejected": -5.373399257659912, + "epoch": 0.95, + "learning_rate": 3.7970931344425316e-07, + "logits/chosen": -2.878389358520508, + "logits/rejected": -2.899590015411377, + "logps/chosen": -236.1371307373047, + "logps/rejected": -273.2381591796875, + "loss": 0.345, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.30246537923812866, + "rewards/margins": 2.5328197479248047, + "rewards/rejected": -2.2303545475006104, "step": 1840 }, { - "epoch": 1.91, - "learning_rate": 2.0168388825105242e-07, - "logits/chosen": -2.901468276977539, - "logits/rejected": -2.9260478019714355, - "logps/chosen": -327.5912780761719, - "logps/rejected": -345.6995544433594, - "loss": 0.0813, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.8747514486312866, - "rewards/margins": 5.987407684326172, - "rewards/rejected": -5.112656593322754, + "epoch": 0.96, + "learning_rate": 3.787531076687703e-07, + "logits/chosen": -2.7709901332855225, + "logits/rejected": -2.8305325508117676, + "logps/chosen": -241.78689575195312, + "logps/rejected": -254.30398559570312, + "loss": 0.5149, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11071042716503143, + "rewards/margins": 2.531583070755005, + "rewards/rejected": -2.642293691635132, "step": 1850 }, { - "epoch": 1.92, - "learning_rate": 1.997703788748565e-07, - "logits/chosen": -2.87198805809021, - "logits/rejected": -2.875633955001831, - "logps/chosen": -346.01959228515625, - "logps/rejected": -317.34527587890625, - "loss": 0.0973, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.6318389177322388, - "rewards/margins": 5.922452449798584, - "rewards/rejected": -5.290614128112793, + "epoch": 0.96, + "learning_rate": 3.7779690189328745e-07, + "logits/chosen": -2.9073691368103027, + "logits/rejected": -2.941469669342041, + "logps/chosen": -241.056884765625, + "logps/rejected": -272.6620178222656, + "loss": 0.4533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14580278098583221, + "rewards/margins": 2.6054463386535645, + "rewards/rejected": -2.751249313354492, "step": 1860 }, { - "epoch": 1.93, - "learning_rate": 1.9785686949866055e-07, - "logits/chosen": -2.863274335861206, - "logits/rejected": -2.876680612564087, - "logps/chosen": -330.6842346191406, - "logps/rejected": -353.137939453125, - "loss": 0.1081, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.135359764099121, - "rewards/margins": 6.191943168640137, - "rewards/rejected": -5.056582927703857, + "epoch": 0.97, + "learning_rate": 3.7684069611780456e-07, + "logits/chosen": -2.8743133544921875, + "logits/rejected": -2.9183545112609863, + "logps/chosen": -288.95733642578125, + "logps/rejected": -294.15338134765625, + "loss": 0.5509, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.27595505118370056, + "rewards/margins": 2.88635516166687, + "rewards/rejected": -2.6103999614715576, "step": 1870 }, { - "epoch": 1.94, - "learning_rate": 1.9594336012246458e-07, - "logits/chosen": -2.888240337371826, - "logits/rejected": -2.903027296066284, - "logps/chosen": -323.24761962890625, - "logps/rejected": -327.24169921875, - "loss": 0.1001, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.8668416738510132, - "rewards/margins": 6.129169464111328, - "rewards/rejected": -5.262328147888184, + "epoch": 0.97, + "learning_rate": 3.758844903423217e-07, + "logits/chosen": -2.9189352989196777, + "logits/rejected": -2.9405598640441895, + "logps/chosen": -203.60948181152344, + "logps/rejected": -290.3697814941406, + "loss": 0.4634, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3353397250175476, + "rewards/margins": 2.54882550239563, + "rewards/rejected": -2.2134859561920166, "step": 1880 }, { - "epoch": 1.95, - "learning_rate": 1.9402985074626865e-07, - "logits/chosen": -2.91502046585083, - "logits/rejected": -2.9133684635162354, - "logps/chosen": -339.2613830566406, - "logps/rejected": -322.16473388671875, - "loss": 0.0919, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.4730144441127777, - "rewards/margins": 5.7007269859313965, - "rewards/rejected": -5.227712154388428, + "epoch": 0.98, + "learning_rate": 3.749282845668388e-07, + "logits/chosen": -2.8900084495544434, + "logits/rejected": -2.9685158729553223, + "logps/chosen": -245.86343383789062, + "logps/rejected": -315.4371643066406, + "loss": 0.4088, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39556270837783813, + "rewards/margins": 2.9713571071624756, + "rewards/rejected": -2.575794219970703, "step": 1890 }, { - "epoch": 1.96, - "learning_rate": 1.921163413700727e-07, - "logits/chosen": -2.9004921913146973, - "logits/rejected": -2.904614210128784, - "logps/chosen": -315.8312072753906, - "logps/rejected": -312.64373779296875, - "loss": 0.122, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.6923476457595825, - "rewards/margins": 6.188453197479248, - "rewards/rejected": -5.4961066246032715, + "epoch": 0.98, + "learning_rate": 3.739720787913559e-07, + "logits/chosen": -2.884178400039673, + "logits/rejected": -2.942009687423706, + "logps/chosen": -269.1268310546875, + "logps/rejected": -278.32073974609375, + "loss": 0.465, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.03307962417602539, + "rewards/margins": 2.16691255569458, + "rewards/rejected": -2.1338324546813965, "step": 1900 }, { - "epoch": 1.96, - "eval_logits/chosen": -2.874333381652832, - "eval_logits/rejected": -2.8706977367401123, - "eval_logps/chosen": -360.60406494140625, - "eval_logps/rejected": -314.73699951171875, - "eval_loss": 0.5520058870315552, - "eval_rewards/accuracies": 0.777999997138977, - "eval_rewards/chosen": -0.773823082447052, - "eval_rewards/margins": 2.2879505157470703, - "eval_rewards/rejected": -3.0617735385894775, - "eval_runtime": 499.5877, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 0.98, + "eval_logits/chosen": -2.871323585510254, + "eval_logits/rejected": -2.9355289936065674, + "eval_logps/chosen": -242.65213012695312, + "eval_logps/rejected": -299.46307373046875, + "eval_loss": 0.4434332847595215, + "eval_rewards/accuracies": 0.7960000038146973, + "eval_rewards/chosen": 0.5028029680252075, + "eval_rewards/margins": 2.835366725921631, + "eval_rewards/rejected": -2.3325634002685547, + "eval_runtime": 278.765, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 0.448, "step": 1900 }, { - "epoch": 1.97, - "learning_rate": 1.9020283199387677e-07, - "logits/chosen": -2.928802251815796, - "logits/rejected": -2.9289422035217285, - "logps/chosen": -347.2261047363281, - "logps/rejected": -324.1834411621094, - "loss": 0.1138, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5844297409057617, - "rewards/margins": 5.202861785888672, - "rewards/rejected": -4.618431568145752, + "epoch": 0.99, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": -2.8448338508605957, + "logits/rejected": -2.9431285858154297, + "logps/chosen": -256.785400390625, + "logps/rejected": -319.0350036621094, + "loss": 0.423, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6242324709892273, + "rewards/margins": 2.828500270843506, + "rewards/rejected": -2.204267978668213, "step": 1910 }, { - "epoch": 1.98, - "learning_rate": 1.8828932261768083e-07, - "logits/chosen": -2.8577020168304443, - "logits/rejected": -2.8607382774353027, - "logps/chosen": -292.01336669921875, - "logps/rejected": -294.7176208496094, - "loss": 0.1066, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.4617652893066406, - "rewards/margins": 5.312861919403076, - "rewards/rejected": -4.8510966300964355, + "epoch": 0.99, + "learning_rate": 3.7205966724039014e-07, + "logits/chosen": -2.7674853801727295, + "logits/rejected": -2.814272403717041, + "logps/chosen": -245.8392791748047, + "logps/rejected": -304.9154968261719, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4677479863166809, + "rewards/margins": 2.518876075744629, + "rewards/rejected": -2.0511279106140137, "step": 1920 }, { - "epoch": 1.99, - "learning_rate": 1.8637581324148487e-07, - "logits/chosen": -2.9372596740722656, - "logits/rejected": -2.918520450592041, - "logps/chosen": -331.2519226074219, - "logps/rejected": -349.9786071777344, - "loss": 0.105, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.7993395924568176, - "rewards/margins": 5.975264549255371, - "rewards/rejected": -5.175924777984619, + "epoch": 1.0, + "learning_rate": 3.711034614649072e-07, + "logits/chosen": -2.857226848602295, + "logits/rejected": -2.9040684700012207, + "logps/chosen": -250.416259765625, + "logps/rejected": -296.86212158203125, + "loss": 0.4355, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.7089163064956665, + "rewards/margins": 3.51921010017395, + "rewards/rejected": -2.810293436050415, "step": 1930 }, { - "epoch": 2.0, - "learning_rate": 1.8446230386528893e-07, - "logits/chosen": -2.8874263763427734, - "logits/rejected": -2.875577926635742, - "logps/chosen": -295.83270263671875, - "logps/rejected": -297.573486328125, - "loss": 0.0743, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7125546932220459, - "rewards/margins": 5.492268085479736, - "rewards/rejected": -4.779712677001953, + "epoch": 1.0, + "learning_rate": 3.701472556894243e-07, + "logits/chosen": -2.783198833465576, + "logits/rejected": -2.8520195484161377, + "logps/chosen": -234.9251251220703, + "logps/rejected": -343.29779052734375, + "loss": 0.3792, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.32631635665893555, + "rewards/margins": 3.4934210777282715, + "rewards/rejected": -3.167104721069336, "step": 1940 }, { - "epoch": 2.01, - "learning_rate": 1.82548794489093e-07, - "logits/chosen": -2.850792646408081, - "logits/rejected": -2.8736298084259033, - "logps/chosen": -342.15423583984375, - "logps/rejected": -383.8853759765625, - "loss": 0.023, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 1.615793228149414, - "rewards/margins": 7.413217067718506, - "rewards/rejected": -5.797423362731934, + "epoch": 1.01, + "learning_rate": 3.6919104991394144e-07, + "logits/chosen": -2.869868755340576, + "logits/rejected": -2.9353652000427246, + "logps/chosen": -200.75564575195312, + "logps/rejected": -300.0337829589844, + "loss": 0.1056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.424803376197815, + "rewards/margins": 5.865682601928711, + "rewards/rejected": -4.4408793449401855, "step": 1950 }, { - "epoch": 2.02, - "learning_rate": 1.8063528511289706e-07, - "logits/chosen": -2.883690357208252, - "logits/rejected": -2.902615785598755, - "logps/chosen": -322.6307067871094, - "logps/rejected": -383.23187255859375, - "loss": 0.0214, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5958327054977417, - "rewards/margins": 6.831275939941406, - "rewards/rejected": -6.235442161560059, + "epoch": 1.01, + "learning_rate": 3.6823484413845855e-07, + "logits/chosen": -2.823423385620117, + "logits/rejected": -2.85945463180542, + "logps/chosen": -277.2393798828125, + "logps/rejected": -334.59613037109375, + "loss": 0.105, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.2674754858016968, + "rewards/margins": 6.0102996826171875, + "rewards/rejected": -4.742823600769043, "step": 1960 }, { - "epoch": 2.03, - "learning_rate": 1.7872177573670112e-07, - "logits/chosen": -2.8846828937530518, - "logits/rejected": -2.893906831741333, - "logps/chosen": -289.3814392089844, - "logps/rejected": -302.682373046875, - "loss": 0.0231, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6643503904342651, - "rewards/margins": 6.625860691070557, - "rewards/rejected": -5.961510181427002, + "epoch": 1.02, + "learning_rate": 3.6727863836297567e-07, + "logits/chosen": -2.8262939453125, + "logits/rejected": -2.855597972869873, + "logps/chosen": -255.54296875, + "logps/rejected": -308.5851135253906, + "loss": 0.1366, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3961846828460693, + "rewards/margins": 6.024592399597168, + "rewards/rejected": -4.628407001495361, "step": 1970 }, { - "epoch": 2.04, - "learning_rate": 1.7680826636050515e-07, - "logits/chosen": -2.87156343460083, - "logits/rejected": -2.8620166778564453, - "logps/chosen": -311.15234375, - "logps/rejected": -340.3420104980469, - "loss": 0.0197, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1246318817138672, - "rewards/margins": 7.64337158203125, - "rewards/rejected": -6.518739223480225, + "epoch": 1.02, + "learning_rate": 3.663224325874928e-07, + "logits/chosen": -2.7664477825164795, + "logits/rejected": -2.8171486854553223, + "logps/chosen": -189.17161560058594, + "logps/rejected": -328.0600280761719, + "loss": 0.0893, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5123683214187622, + "rewards/margins": 6.919796943664551, + "rewards/rejected": -5.407429218292236, "step": 1980 }, { - "epoch": 2.06, - "learning_rate": 1.7489475698430921e-07, - "logits/chosen": -2.8390986919403076, - "logits/rejected": -2.8416831493377686, - "logps/chosen": -321.42474365234375, - "logps/rejected": -331.75921630859375, - "loss": 0.0891, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8244115114212036, - "rewards/margins": 7.307824611663818, - "rewards/rejected": -6.4834136962890625, + "epoch": 1.03, + "learning_rate": 3.653662268120099e-07, + "logits/chosen": -2.76231050491333, + "logits/rejected": -2.8225061893463135, + "logps/chosen": -216.73330688476562, + "logps/rejected": -362.13922119140625, + "loss": 0.0676, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8166065216064453, + "rewards/margins": 6.026690483093262, + "rewards/rejected": -4.210083961486816, "step": 1990 }, { - "epoch": 2.07, - "learning_rate": 1.7298124760811328e-07, - "logits/chosen": -2.8488190174102783, - "logits/rejected": -2.827085256576538, - "logps/chosen": -349.63287353515625, - "logps/rejected": -342.38812255859375, - "loss": 0.0242, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.6490751504898071, - "rewards/margins": 7.272631645202637, - "rewards/rejected": -6.623556613922119, + "epoch": 1.03, + "learning_rate": 3.6441002103652707e-07, + "logits/chosen": -2.784984588623047, + "logits/rejected": -2.8118598461151123, + "logps/chosen": -235.13330078125, + "logps/rejected": -352.83721923828125, + "loss": 0.0921, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2493923902511597, + "rewards/margins": 6.322751045227051, + "rewards/rejected": -5.073358535766602, "step": 2000 }, { - "epoch": 2.07, - "eval_logits/chosen": -2.860494375228882, - "eval_logits/rejected": -2.8475303649902344, - "eval_logps/chosen": -365.9564514160156, - "eval_logps/rejected": -326.2118835449219, - "eval_loss": 0.6110661029815674, - "eval_rewards/accuracies": 0.7639999985694885, - "eval_rewards/chosen": -1.3090580701828003, - "eval_rewards/margins": 2.900202989578247, - "eval_rewards/rejected": -4.209260940551758, - "eval_runtime": 499.6212, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 1.03, + "eval_logits/chosen": -2.785792589187622, + "eval_logits/rejected": -2.8519487380981445, + "eval_logps/chosen": -246.1128387451172, + "eval_logps/rejected": -310.61309814453125, + "eval_loss": 0.4446970224380493, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": 0.15673115849494934, + "eval_rewards/margins": 3.604296922683716, + "eval_rewards/rejected": -3.4475655555725098, + "eval_runtime": 278.4785, + "eval_samples_per_second": 7.182, + "eval_steps_per_second": 0.449, "step": 2000 }, { - "epoch": 2.08, - "learning_rate": 1.7106773823191734e-07, - "logits/chosen": -2.896272659301758, - "logits/rejected": -2.8855504989624023, - "logps/chosen": -329.5340270996094, - "logps/rejected": -330.1529846191406, - "loss": 0.0161, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 1.1853575706481934, - "rewards/margins": 7.9635748863220215, - "rewards/rejected": -6.778217315673828, + "epoch": 1.04, + "learning_rate": 3.634538152610442e-07, + "logits/chosen": -2.8091163635253906, + "logits/rejected": -2.867849826812744, + "logps/chosen": -263.2342224121094, + "logps/rejected": -345.95953369140625, + "loss": 0.0674, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6022307872772217, + "rewards/margins": 6.758018493652344, + "rewards/rejected": -5.155787467956543, "step": 2010 }, { - "epoch": 2.09, - "learning_rate": 1.691542288557214e-07, - "logits/chosen": -2.8596339225769043, - "logits/rejected": -2.8543241024017334, - "logps/chosen": -334.30126953125, - "logps/rejected": -366.9293518066406, - "loss": 0.013, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7636312246322632, - "rewards/margins": 8.40580940246582, - "rewards/rejected": -7.642178535461426, + "epoch": 1.04, + "learning_rate": 3.624976094855613e-07, + "logits/chosen": -2.7523140907287598, + "logits/rejected": -2.8147144317626953, + "logps/chosen": -248.58163452148438, + "logps/rejected": -312.04058837890625, + "loss": 0.0759, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.417191743850708, + "rewards/margins": 6.807399749755859, + "rewards/rejected": -5.3902082443237305, "step": 2020 }, { - "epoch": 2.1, - "learning_rate": 1.6724071947952544e-07, - "logits/chosen": -2.858603000640869, - "logits/rejected": -2.8608195781707764, - "logps/chosen": -308.79205322265625, - "logps/rejected": -325.49432373046875, - "loss": 0.0295, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.8938325047492981, - "rewards/margins": 7.766848087310791, - "rewards/rejected": -6.873016357421875, + "epoch": 1.05, + "learning_rate": 3.615414037100784e-07, + "logits/chosen": -2.6939048767089844, + "logits/rejected": -2.765730142593384, + "logps/chosen": -207.52597045898438, + "logps/rejected": -296.8516540527344, + "loss": 0.1055, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1127474308013916, + "rewards/margins": 6.433894157409668, + "rewards/rejected": -5.3211469650268555, "step": 2030 }, { - "epoch": 2.11, - "learning_rate": 1.653272101033295e-07, - "logits/chosen": -2.877743721008301, - "logits/rejected": -2.8870015144348145, - "logps/chosen": -360.132080078125, - "logps/rejected": -332.0694580078125, - "loss": 0.0139, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.8420451283454895, - "rewards/margins": 7.829560279846191, - "rewards/rejected": -6.987515449523926, + "epoch": 1.05, + "learning_rate": 3.6058519793459553e-07, + "logits/chosen": -2.743504047393799, + "logits/rejected": -2.8185174465179443, + "logps/chosen": -245.7578125, + "logps/rejected": -285.220703125, + "loss": 0.0985, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.0196592807769775, + "rewards/margins": 6.537779331207275, + "rewards/rejected": -4.518120288848877, "step": 2040 }, { - "epoch": 2.12, - "learning_rate": 1.6341370072713356e-07, - "logits/chosen": -2.908205509185791, - "logits/rejected": -2.8962578773498535, - "logps/chosen": -363.4471130371094, - "logps/rejected": -384.8677978515625, - "loss": 0.0182, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.9818631410598755, - "rewards/margins": 7.976204872131348, - "rewards/rejected": -6.9943413734436035, + "epoch": 1.06, + "learning_rate": 3.5962899215911265e-07, + "logits/chosen": -2.7714102268218994, + "logits/rejected": -2.8159098625183105, + "logps/chosen": -169.52871704101562, + "logps/rejected": -333.046630859375, + "loss": 0.0756, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3516714572906494, + "rewards/margins": 5.515862941741943, + "rewards/rejected": -4.164191246032715, "step": 2050 }, { - "epoch": 2.13, - "learning_rate": 1.6150019135093762e-07, - "logits/chosen": -2.9130074977874756, - "logits/rejected": -2.867249011993408, - "logps/chosen": -394.2611999511719, - "logps/rejected": -368.763427734375, - "loss": 0.0138, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1961416006088257, - "rewards/margins": 8.216410636901855, - "rewards/rejected": -7.020269870758057, + "epoch": 1.06, + "learning_rate": 3.5867278638362976e-07, + "logits/chosen": -2.7787039279937744, + "logits/rejected": -2.8366940021514893, + "logps/chosen": -302.2014465332031, + "logps/rejected": -352.3808898925781, + "loss": 0.0629, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.691415786743164, + "rewards/margins": 7.102464199066162, + "rewards/rejected": -5.411048889160156, "step": 2060 }, { - "epoch": 2.14, - "learning_rate": 1.5958668197474169e-07, - "logits/chosen": -2.9357872009277344, - "logits/rejected": -2.9347925186157227, - "logps/chosen": -328.8543701171875, - "logps/rejected": -369.68292236328125, - "loss": 0.0166, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9537331461906433, - "rewards/margins": 8.309990882873535, - "rewards/rejected": -7.356257438659668, + "epoch": 1.07, + "learning_rate": 3.577165806081469e-07, + "logits/chosen": -2.8112542629241943, + "logits/rejected": -2.8483777046203613, + "logps/chosen": -250.97900390625, + "logps/rejected": -293.00360107421875, + "loss": 0.097, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0092226266860962, + "rewards/margins": 5.730241298675537, + "rewards/rejected": -4.7210187911987305, "step": 2070 }, { - "epoch": 2.15, - "learning_rate": 1.5767317259854572e-07, - "logits/chosen": -2.928189754486084, - "logits/rejected": -2.8961434364318848, - "logps/chosen": -307.6603088378906, - "logps/rejected": -294.9259338378906, - "loss": 0.0136, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.532218873500824, - "rewards/margins": 7.480643272399902, - "rewards/rejected": -6.948424339294434, + "epoch": 1.07, + "learning_rate": 3.56760374832664e-07, + "logits/chosen": -2.828165292739868, + "logits/rejected": -2.830998420715332, + "logps/chosen": -285.7459411621094, + "logps/rejected": -356.6951599121094, + "loss": 0.0637, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7443921566009521, + "rewards/margins": 7.199429988861084, + "rewards/rejected": -5.455037593841553, "step": 2080 }, { - "epoch": 2.16, - "learning_rate": 1.5575966322234978e-07, - "logits/chosen": -2.8677449226379395, - "logits/rejected": -2.8330647945404053, - "logps/chosen": -331.5555725097656, - "logps/rejected": -366.13372802734375, - "loss": 0.0149, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.19210126996040344, - "rewards/margins": 7.909059047698975, - "rewards/rejected": -7.716958522796631, + "epoch": 1.08, + "learning_rate": 3.5580416905718106e-07, + "logits/chosen": -2.7509326934814453, + "logits/rejected": -2.816725492477417, + "logps/chosen": -279.53314208984375, + "logps/rejected": -309.99639892578125, + "loss": 0.0915, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6722646951675415, + "rewards/margins": 7.914118766784668, + "rewards/rejected": -6.241853713989258, "step": 2090 }, { - "epoch": 2.17, - "learning_rate": 1.5384615384615385e-07, - "logits/chosen": -2.8884618282318115, - "logits/rejected": -2.8951830863952637, - "logps/chosen": -340.19757080078125, - "logps/rejected": -328.8606872558594, - "loss": 0.017, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4257478713989258, - "rewards/margins": 7.282183647155762, - "rewards/rejected": -6.856435298919678, + "epoch": 1.08, + "learning_rate": 3.5484796328169817e-07, + "logits/chosen": -2.7287991046905518, + "logits/rejected": -2.78497052192688, + "logps/chosen": -244.94091796875, + "logps/rejected": -279.9932861328125, + "loss": 0.0776, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1437629461288452, + "rewards/margins": 7.133315086364746, + "rewards/rejected": -5.9895524978637695, "step": 2100 }, { - "epoch": 2.17, - "eval_logits/chosen": -2.830853223800659, - "eval_logits/rejected": -2.8138229846954346, - "eval_logps/chosen": -369.8929748535156, - "eval_logps/rejected": -333.4358215332031, - "eval_loss": 0.6473292112350464, - "eval_rewards/accuracies": 0.7620000243186951, - "eval_rewards/chosen": -1.7027121782302856, - "eval_rewards/margins": 3.2289414405822754, - "eval_rewards/rejected": -4.93165397644043, - "eval_runtime": 499.8882, - "eval_samples_per_second": 4.001, - "eval_steps_per_second": 0.5, + "epoch": 1.08, + "eval_logits/chosen": -2.7763452529907227, + "eval_logits/rejected": -2.841165542602539, + "eval_logps/chosen": -246.7716522216797, + "eval_logps/rejected": -315.55926513671875, + "eval_loss": 0.47759073972702026, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": 0.09085023403167725, + "eval_rewards/margins": 4.033032417297363, + "eval_rewards/rejected": -3.9421823024749756, + "eval_runtime": 278.7756, + "eval_samples_per_second": 7.174, + "eval_steps_per_second": 0.448, "step": 2100 }, { - "epoch": 2.18, - "learning_rate": 1.519326444699579e-07, - "logits/chosen": -2.8692545890808105, - "logits/rejected": -2.8672494888305664, - "logps/chosen": -314.2371520996094, - "logps/rejected": -342.84564208984375, - "loss": 0.0121, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4992009103298187, - "rewards/margins": 8.060718536376953, - "rewards/rejected": -7.56151819229126, + "epoch": 1.09, + "learning_rate": 3.538917575062153e-07, + "logits/chosen": -2.7345612049102783, + "logits/rejected": -2.7558319568634033, + "logps/chosen": -287.73345947265625, + "logps/rejected": -363.9012145996094, + "loss": 0.0926, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3527368307113647, + "rewards/margins": 7.984021186828613, + "rewards/rejected": -6.631285190582275, "step": 2110 }, { - "epoch": 2.19, - "learning_rate": 1.5001913509376197e-07, - "logits/chosen": -2.8434972763061523, - "logits/rejected": -2.8411061763763428, - "logps/chosen": -338.0544738769531, - "logps/rejected": -347.9879455566406, - "loss": 0.018, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.41134971380233765, - "rewards/margins": 7.6756792068481445, - "rewards/rejected": -7.264329433441162, + "epoch": 1.09, + "learning_rate": 3.529355517307324e-07, + "logits/chosen": -2.761762857437134, + "logits/rejected": -2.779594898223877, + "logps/chosen": -236.8489990234375, + "logps/rejected": -340.5326843261719, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2858283519744873, + "rewards/margins": 7.696159362792969, + "rewards/rejected": -6.410330772399902, "step": 2120 }, { - "epoch": 2.2, - "learning_rate": 1.4810562571756603e-07, - "logits/chosen": -2.9009549617767334, - "logits/rejected": -2.9190447330474854, - "logps/chosen": -303.60821533203125, - "logps/rejected": -355.43682861328125, - "loss": 0.0144, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.2904825210571289, - "rewards/margins": 8.14781379699707, - "rewards/rejected": -7.857331275939941, + "epoch": 1.1, + "learning_rate": 3.519793459552495e-07, + "logits/chosen": -2.776559352874756, + "logits/rejected": -2.8439061641693115, + "logps/chosen": -260.61431884765625, + "logps/rejected": -348.29681396484375, + "loss": 0.1065, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.6920270919799805, + "rewards/margins": 6.389278888702393, + "rewards/rejected": -5.697251796722412, "step": 2130 }, { - "epoch": 2.21, - "learning_rate": 1.4619211634137007e-07, - "logits/chosen": -2.875998020172119, - "logits/rejected": -2.8702502250671387, - "logps/chosen": -329.28955078125, - "logps/rejected": -334.1612548828125, - "loss": 0.0208, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.40643930435180664, - "rewards/margins": 7.531008243560791, - "rewards/rejected": -7.124569892883301, + "epoch": 1.1, + "learning_rate": 3.510231401797667e-07, + "logits/chosen": -2.8243560791015625, + "logits/rejected": -2.829003095626831, + "logps/chosen": -294.7447204589844, + "logps/rejected": -368.647216796875, + "loss": 0.0906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7448446750640869, + "rewards/margins": 8.040909767150879, + "rewards/rejected": -7.296065330505371, "step": 2140 }, { - "epoch": 2.22, - "learning_rate": 1.4427860696517413e-07, - "logits/chosen": -2.814622402191162, - "logits/rejected": -2.8140344619750977, - "logps/chosen": -369.78466796875, - "logps/rejected": -388.4234313964844, - "loss": 0.0156, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.7858397960662842, - "rewards/margins": 9.021352767944336, - "rewards/rejected": -8.235512733459473, + "epoch": 1.11, + "learning_rate": 3.500669344042838e-07, + "logits/chosen": -2.7932868003845215, + "logits/rejected": -2.8034961223602295, + "logps/chosen": -227.6187744140625, + "logps/rejected": -343.4903869628906, + "loss": 0.0865, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5658053159713745, + "rewards/margins": 7.489978790283203, + "rewards/rejected": -6.924172878265381, "step": 2150 }, { - "epoch": 2.23, - "learning_rate": 1.423650975889782e-07, - "logits/chosen": -2.8845508098602295, - "logits/rejected": -2.862572431564331, - "logps/chosen": -364.07391357421875, - "logps/rejected": -366.08074951171875, - "loss": 0.0181, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.4860958456993103, - "rewards/margins": 8.600242614746094, - "rewards/rejected": -8.11414623260498, + "epoch": 1.12, + "learning_rate": 3.491107286288009e-07, + "logits/chosen": -2.7819812297821045, + "logits/rejected": -2.839261531829834, + "logps/chosen": -238.858154296875, + "logps/rejected": -349.450927734375, + "loss": 0.0682, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2682182788848877, + "rewards/margins": 7.554483890533447, + "rewards/rejected": -6.286264896392822, "step": 2160 }, { - "epoch": 2.24, - "learning_rate": 1.4045158821278225e-07, - "logits/chosen": -2.8357956409454346, - "logits/rejected": -2.8576788902282715, - "logps/chosen": -309.79833984375, - "logps/rejected": -352.6476135253906, - "loss": 0.0191, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2403034269809723, - "rewards/margins": 7.672799110412598, - "rewards/rejected": -7.432496070861816, + "epoch": 1.12, + "learning_rate": 3.4815452285331803e-07, + "logits/chosen": -2.733638048171997, + "logits/rejected": -2.7781291007995605, + "logps/chosen": -301.14105224609375, + "logps/rejected": -330.53912353515625, + "loss": 0.0731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8851715326309204, + "rewards/margins": 6.45550537109375, + "rewards/rejected": -5.570334434509277, "step": 2170 }, { - "epoch": 2.25, - "learning_rate": 1.3853807883658632e-07, - "logits/chosen": -2.843113660812378, - "logits/rejected": -2.8533434867858887, - "logps/chosen": -378.12005615234375, - "logps/rejected": -378.74713134765625, - "loss": 0.0168, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.9185358881950378, - "rewards/margins": 8.218305587768555, - "rewards/rejected": -7.299769401550293, + "epoch": 1.13, + "learning_rate": 3.4719831707783515e-07, + "logits/chosen": -2.7247109413146973, + "logits/rejected": -2.740576982498169, + "logps/chosen": -304.32928466796875, + "logps/rejected": -386.80609130859375, + "loss": 0.0775, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8642752766609192, + "rewards/margins": 7.015794277191162, + "rewards/rejected": -6.151518821716309, "step": 2180 }, { - "epoch": 2.26, - "learning_rate": 1.3662456946039035e-07, - "logits/chosen": -2.8196911811828613, - "logits/rejected": -2.8274574279785156, - "logps/chosen": -305.7922668457031, - "logps/rejected": -350.10772705078125, - "loss": 0.0117, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.43905988335609436, - "rewards/margins": 8.206399917602539, - "rewards/rejected": -7.76733922958374, + "epoch": 1.13, + "learning_rate": 3.4624211130235227e-07, + "logits/chosen": -2.798527956008911, + "logits/rejected": -2.8238823413848877, + "logps/chosen": -205.30416870117188, + "logps/rejected": -305.39337158203125, + "loss": 0.0775, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6926091909408569, + "rewards/margins": 7.436480522155762, + "rewards/rejected": -6.743871212005615, "step": 2190 }, { - "epoch": 2.27, - "learning_rate": 1.3471106008419441e-07, - "logits/chosen": -2.8260867595672607, - "logits/rejected": -2.8560781478881836, - "logps/chosen": -319.553955078125, - "logps/rejected": -375.03985595703125, - "loss": 0.0153, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.368704617023468, - "rewards/margins": 8.854362487792969, - "rewards/rejected": -8.485657691955566, + "epoch": 1.14, + "learning_rate": 3.452859055268694e-07, + "logits/chosen": -2.7647600173950195, + "logits/rejected": -2.799379348754883, + "logps/chosen": -225.5302276611328, + "logps/rejected": -308.9101867675781, + "loss": 0.0679, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8118773698806763, + "rewards/margins": 7.733236789703369, + "rewards/rejected": -6.921359062194824, "step": 2200 }, { - "epoch": 2.27, - "eval_logits/chosen": -2.80574369430542, - "eval_logits/rejected": -2.7871527671813965, - "eval_logps/chosen": -372.91436767578125, - "eval_logps/rejected": -338.00445556640625, - "eval_loss": 0.6657643914222717, - "eval_rewards/accuracies": 0.7699999809265137, - "eval_rewards/chosen": -2.004854440689087, - "eval_rewards/margins": 3.383664846420288, - "eval_rewards/rejected": -5.388518810272217, - "eval_runtime": 499.0606, - "eval_samples_per_second": 4.008, - "eval_steps_per_second": 0.501, + "epoch": 1.14, + "eval_logits/chosen": -2.744640588760376, + "eval_logits/rejected": -2.808490037918091, + "eval_logps/chosen": -254.41099548339844, + "eval_logps/rejected": -324.3449401855469, + "eval_loss": 0.4769650995731354, + "eval_rewards/accuracies": 0.8240000009536743, + "eval_rewards/chosen": -0.6730862855911255, + "eval_rewards/margins": 4.147665977478027, + "eval_rewards/rejected": -4.820752143859863, + "eval_runtime": 278.654, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.449, "step": 2200 }, { - "epoch": 2.28, - "learning_rate": 1.3279755070799848e-07, - "logits/chosen": -2.8646817207336426, - "logits/rejected": -2.8080735206604004, - "logps/chosen": -317.29412841796875, - "logps/rejected": -357.2666931152344, - "loss": 0.0234, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.43733105063438416, - "rewards/margins": 8.163047790527344, - "rewards/rejected": -7.725717067718506, + "epoch": 1.14, + "learning_rate": 3.443296997513865e-07, + "logits/chosen": -2.804619550704956, + "logits/rejected": -2.8267204761505127, + "logps/chosen": -268.3411560058594, + "logps/rejected": -381.73199462890625, + "loss": 0.0954, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9453462362289429, + "rewards/margins": 7.497371673583984, + "rewards/rejected": -6.55202579498291, "step": 2210 }, { - "epoch": 2.29, - "learning_rate": 1.3088404133180254e-07, - "logits/chosen": -2.8100650310516357, - "logits/rejected": -2.7986183166503906, - "logps/chosen": -377.2625427246094, - "logps/rejected": -359.5750427246094, - "loss": 0.0141, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7715356349945068, - "rewards/margins": 8.111381530761719, - "rewards/rejected": -7.339845180511475, + "epoch": 1.15, + "learning_rate": 3.433734939759036e-07, + "logits/chosen": -2.7686784267425537, + "logits/rejected": -2.811330556869507, + "logps/chosen": -259.18743896484375, + "logps/rejected": -403.6307373046875, + "loss": 0.0815, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2846524715423584, + "rewards/margins": 9.648810386657715, + "rewards/rejected": -8.36415958404541, "step": 2220 }, { - "epoch": 2.3, - "learning_rate": 1.289705319556066e-07, - "logits/chosen": -2.802110195159912, - "logits/rejected": -2.7565503120422363, - "logps/chosen": -313.4336853027344, - "logps/rejected": -351.89703369140625, - "loss": 0.0147, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.36583349108695984, - "rewards/margins": 8.023749351501465, - "rewards/rejected": -8.389582633972168, + "epoch": 1.15, + "learning_rate": 3.4241728820042073e-07, + "logits/chosen": -2.731107234954834, + "logits/rejected": -2.77121639251709, + "logps/chosen": -223.3089599609375, + "logps/rejected": -357.7713928222656, + "loss": 0.0832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13498951494693756, + "rewards/margins": 6.8718414306640625, + "rewards/rejected": -6.736852169036865, "step": 2230 }, { - "epoch": 2.31, - "learning_rate": 1.2705702257941064e-07, - "logits/chosen": -2.841660499572754, - "logits/rejected": -2.8044819831848145, - "logps/chosen": -352.5540771484375, - "logps/rejected": -346.982666015625, - "loss": 0.011, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3794718086719513, - "rewards/margins": 8.321603775024414, - "rewards/rejected": -7.942131996154785, + "epoch": 1.16, + "learning_rate": 3.4146108242493784e-07, + "logits/chosen": -2.811983823776245, + "logits/rejected": -2.864361524581909, + "logps/chosen": -215.65737915039062, + "logps/rejected": -333.7270812988281, + "loss": 0.0782, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2699239253997803, + "rewards/margins": 7.710413932800293, + "rewards/rejected": -6.440489768981934, "step": 2240 }, { - "epoch": 2.32, - "learning_rate": 1.251435132032147e-07, - "logits/chosen": -2.850743532180786, - "logits/rejected": -2.8343441486358643, - "logps/chosen": -339.0596008300781, - "logps/rejected": -389.5365905761719, - "loss": 0.0971, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.7595340013504028, - "rewards/margins": 8.639189720153809, - "rewards/rejected": -7.8796563148498535, + "epoch": 1.16, + "learning_rate": 3.405048766494549e-07, + "logits/chosen": -2.812655210494995, + "logits/rejected": -2.8382606506347656, + "logps/chosen": -280.080078125, + "logps/rejected": -329.3871154785156, + "loss": 0.0863, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7157481908798218, + "rewards/margins": 7.961747646331787, + "rewards/rejected": -6.245999336242676, "step": 2250 }, { - "epoch": 2.33, - "learning_rate": 1.2323000382701873e-07, - "logits/chosen": -2.8145759105682373, - "logits/rejected": -2.8445546627044678, - "logps/chosen": -345.4944152832031, - "logps/rejected": -367.5064392089844, - "loss": 0.0459, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.18377834558486938, - "rewards/margins": 8.928794860839844, - "rewards/rejected": -8.745016098022461, + "epoch": 1.17, + "learning_rate": 3.39548670873972e-07, + "logits/chosen": -2.803392171859741, + "logits/rejected": -2.8344738483428955, + "logps/chosen": -296.22540283203125, + "logps/rejected": -401.93194580078125, + "loss": 0.1221, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1003711223602295, + "rewards/margins": 9.793557167053223, + "rewards/rejected": -7.693185329437256, "step": 2260 }, { - "epoch": 2.34, - "learning_rate": 1.213164944508228e-07, - "logits/chosen": -2.867486000061035, - "logits/rejected": -2.849432945251465, - "logps/chosen": -380.73876953125, - "logps/rejected": -375.8220520019531, - "loss": 0.0109, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.4815305173397064, - "rewards/margins": 8.172357559204102, - "rewards/rejected": -7.690826416015625, + "epoch": 1.17, + "learning_rate": 3.3859246509848914e-07, + "logits/chosen": -2.7670085430145264, + "logits/rejected": -2.8575000762939453, + "logps/chosen": -259.09197998046875, + "logps/rejected": -358.9950256347656, + "loss": 0.0765, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5088821649551392, + "rewards/margins": 7.601584434509277, + "rewards/rejected": -6.0927019119262695, "step": 2270 }, { - "epoch": 2.35, - "learning_rate": 1.1940298507462686e-07, - "logits/chosen": -2.882193088531494, - "logits/rejected": -2.8630523681640625, - "logps/chosen": -309.0014953613281, - "logps/rejected": -321.5120849609375, - "loss": 0.0166, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.2122192680835724, - "rewards/margins": 7.754061698913574, - "rewards/rejected": -7.966280937194824, + "epoch": 1.18, + "learning_rate": 3.376362593230063e-07, + "logits/chosen": -2.8007359504699707, + "logits/rejected": -2.861616611480713, + "logps/chosen": -227.0973663330078, + "logps/rejected": -363.10223388671875, + "loss": 0.076, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5983880758285522, + "rewards/margins": 9.013298034667969, + "rewards/rejected": -7.414910316467285, "step": 2280 }, { - "epoch": 2.37, - "learning_rate": 1.1748947569843092e-07, - "logits/chosen": -2.8814282417297363, - "logits/rejected": -2.8726353645324707, - "logps/chosen": -348.33294677734375, - "logps/rejected": -351.8730163574219, - "loss": 0.0209, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.46276918053627014, - "rewards/margins": 8.149224281311035, - "rewards/rejected": -7.686454772949219, + "epoch": 1.18, + "learning_rate": 3.366800535475234e-07, + "logits/chosen": -2.807025909423828, + "logits/rejected": -2.799335241317749, + "logps/chosen": -268.4599914550781, + "logps/rejected": -343.4675598144531, + "loss": 0.0852, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4412739276885986, + "rewards/margins": 7.5395050048828125, + "rewards/rejected": -6.098231315612793, "step": 2290 }, { - "epoch": 2.38, - "learning_rate": 1.1557596632223497e-07, - "logits/chosen": -2.839801073074341, - "logits/rejected": -2.853092670440674, - "logps/chosen": -348.4852600097656, - "logps/rejected": -325.7494201660156, - "loss": 0.0215, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5979126691818237, - "rewards/margins": 7.9850335121154785, - "rewards/rejected": -7.387121677398682, + "epoch": 1.19, + "learning_rate": 3.3572384777204054e-07, + "logits/chosen": -2.8694748878479004, + "logits/rejected": -2.8965165615081787, + "logps/chosen": -247.4459686279297, + "logps/rejected": -319.42218017578125, + "loss": 0.0696, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2710659503936768, + "rewards/margins": 7.4833526611328125, + "rewards/rejected": -6.212286949157715, "step": 2300 }, { - "epoch": 2.38, - "eval_logits/chosen": -2.840635299682617, - "eval_logits/rejected": -2.82096266746521, - "eval_logps/chosen": -370.4217529296875, - "eval_logps/rejected": -336.2837219238281, - "eval_loss": 0.6721770167350769, - "eval_rewards/accuracies": 0.777999997138977, - "eval_rewards/chosen": -1.7555896043777466, - "eval_rewards/margins": 3.46085524559021, - "eval_rewards/rejected": -5.216445446014404, - "eval_runtime": 499.1898, - "eval_samples_per_second": 4.006, - "eval_steps_per_second": 0.501, + "epoch": 1.19, + "eval_logits/chosen": -2.8013522624969482, + "eval_logits/rejected": -2.8622031211853027, + "eval_logps/chosen": -247.92803955078125, + "eval_logps/rejected": -317.9334411621094, + "eval_loss": 0.4886242747306824, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -0.024788517504930496, + "eval_rewards/margins": 4.154811382293701, + "eval_rewards/rejected": -4.179599761962891, + "eval_runtime": 278.9579, + "eval_samples_per_second": 7.17, + "eval_steps_per_second": 0.448, "step": 2300 }, { - "epoch": 2.39, - "learning_rate": 1.1366245694603903e-07, - "logits/chosen": -2.863748550415039, - "logits/rejected": -2.858935594558716, - "logps/chosen": -323.34002685546875, - "logps/rejected": -339.47552490234375, - "loss": 0.0189, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.011857276782393456, - "rewards/margins": 7.8068037033081055, - "rewards/rejected": -7.818660736083984, + "epoch": 1.19, + "learning_rate": 3.3476764199655765e-07, + "logits/chosen": -2.802063465118408, + "logits/rejected": -2.82922625541687, + "logps/chosen": -216.39633178710938, + "logps/rejected": -306.6519470214844, + "loss": 0.0655, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3033801317214966, + "rewards/margins": 7.051173210144043, + "rewards/rejected": -5.747794151306152, "step": 2310 }, { - "epoch": 2.4, - "learning_rate": 1.1174894756984308e-07, - "logits/chosen": -2.8732120990753174, - "logits/rejected": -2.865734577178955, - "logps/chosen": -312.9558410644531, - "logps/rejected": -335.70782470703125, - "loss": 0.0157, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.2408231794834137, - "rewards/margins": 8.007491111755371, - "rewards/rejected": -7.76666784286499, + "epoch": 1.2, + "learning_rate": 3.3381143622107477e-07, + "logits/chosen": -2.7702291011810303, + "logits/rejected": -2.840040683746338, + "logps/chosen": -283.4988708496094, + "logps/rejected": -366.2091369628906, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2266050577163696, + "rewards/margins": 8.91787338256836, + "rewards/rejected": -7.691267967224121, "step": 2320 }, { - "epoch": 2.41, - "learning_rate": 1.0983543819364714e-07, - "logits/chosen": -2.8552298545837402, - "logits/rejected": -2.8243603706359863, - "logps/chosen": -353.9219665527344, - "logps/rejected": -354.1855163574219, - "loss": 0.0154, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.15999922156333923, - "rewards/margins": 8.37447738647461, - "rewards/rejected": -8.2144775390625, + "epoch": 1.2, + "learning_rate": 3.328552304455919e-07, + "logits/chosen": -2.8088457584381104, + "logits/rejected": -2.8401389122009277, + "logps/chosen": -233.0177001953125, + "logps/rejected": -323.83734130859375, + "loss": 0.076, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7884088754653931, + "rewards/margins": 6.651333808898926, + "rewards/rejected": -5.862925052642822, "step": 2330 }, { - "epoch": 2.42, - "learning_rate": 1.079219288174512e-07, - "logits/chosen": -2.831439971923828, - "logits/rejected": -2.8317112922668457, - "logps/chosen": -287.0343017578125, - "logps/rejected": -336.79254150390625, - "loss": 0.0173, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.47535672783851624, - "rewards/margins": 8.473257064819336, - "rewards/rejected": -7.997899055480957, + "epoch": 1.21, + "learning_rate": 3.31899024670109e-07, + "logits/chosen": -2.820403575897217, + "logits/rejected": -2.8067612648010254, + "logps/chosen": -228.03482055664062, + "logps/rejected": -342.9540710449219, + "loss": 0.0836, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4116134643554688, + "rewards/margins": 7.760331630706787, + "rewards/rejected": -6.34871768951416, "step": 2340 }, { - "epoch": 2.43, - "learning_rate": 1.0600841944125525e-07, - "logits/chosen": -2.8530073165893555, - "logits/rejected": -2.8641648292541504, - "logps/chosen": -333.83526611328125, - "logps/rejected": -347.7726135253906, - "loss": 0.0181, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.020889759063720703, - "rewards/margins": 8.293529510498047, - "rewards/rejected": -8.31441879272461, + "epoch": 1.21, + "learning_rate": 3.309428188946261e-07, + "logits/chosen": -2.7867391109466553, + "logits/rejected": -2.8338825702667236, + "logps/chosen": -221.8268280029297, + "logps/rejected": -323.56317138671875, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8608427047729492, + "rewards/margins": 7.180264472961426, + "rewards/rejected": -6.319421768188477, "step": 2350 }, { - "epoch": 2.44, - "learning_rate": 1.0409491006505931e-07, - "logits/chosen": -2.828545331954956, - "logits/rejected": -2.837430238723755, - "logps/chosen": -356.1022033691406, - "logps/rejected": -366.8138732910156, - "loss": 0.0431, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.3571850657463074, - "rewards/margins": 8.733023643493652, - "rewards/rejected": -8.375839233398438, + "epoch": 1.22, + "learning_rate": 3.2998661311914323e-07, + "logits/chosen": -2.7895936965942383, + "logits/rejected": -2.8425674438476562, + "logps/chosen": -241.6412811279297, + "logps/rejected": -352.6873779296875, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3408452272415161, + "rewards/margins": 8.829115867614746, + "rewards/rejected": -7.488271236419678, "step": 2360 }, { - "epoch": 2.45, - "learning_rate": 1.0218140068886336e-07, - "logits/chosen": -2.76576566696167, - "logits/rejected": -2.765660285949707, - "logps/chosen": -331.6526794433594, - "logps/rejected": -355.6332092285156, - "loss": 0.0163, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.01460124272853136, - "rewards/margins": 7.919270992279053, - "rewards/rejected": -7.933871269226074, + "epoch": 1.22, + "learning_rate": 3.2903040734366035e-07, + "logits/chosen": -2.835283041000366, + "logits/rejected": -2.8766496181488037, + "logps/chosen": -222.8701171875, + "logps/rejected": -350.6272888183594, + "loss": 0.0624, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3985366821289062, + "rewards/margins": 8.540349006652832, + "rewards/rejected": -7.141812324523926, "step": 2370 }, { - "epoch": 2.46, - "learning_rate": 1.0026789131266743e-07, - "logits/chosen": -2.7807886600494385, - "logits/rejected": -2.781130075454712, - "logps/chosen": -338.2047424316406, - "logps/rejected": -384.52490234375, - "loss": 0.0147, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.1773863583803177, - "rewards/margins": 9.542524337768555, - "rewards/rejected": -9.365138053894043, + "epoch": 1.23, + "learning_rate": 3.2807420156817746e-07, + "logits/chosen": -2.733876943588257, + "logits/rejected": -2.7979674339294434, + "logps/chosen": -194.2627410888672, + "logps/rejected": -343.9058532714844, + "loss": 0.0898, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2534451484680176, + "rewards/margins": 7.484607696533203, + "rewards/rejected": -6.2311625480651855, "step": 2380 }, { - "epoch": 2.47, - "learning_rate": 9.835438193647149e-08, - "logits/chosen": -2.7662460803985596, - "logits/rejected": -2.75819730758667, - "logps/chosen": -353.21697998046875, - "logps/rejected": -345.8397216796875, - "loss": 0.0181, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.11677594482898712, - "rewards/margins": 8.12705135345459, - "rewards/rejected": -8.243826866149902, + "epoch": 1.23, + "learning_rate": 3.271179957926946e-07, + "logits/chosen": -2.8532767295837402, + "logits/rejected": -2.884650707244873, + "logps/chosen": -263.845458984375, + "logps/rejected": -340.37127685546875, + "loss": 0.0768, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.687608540058136, + "rewards/margins": 7.081732273101807, + "rewards/rejected": -6.394123077392578, "step": 2390 }, { - "epoch": 2.48, - "learning_rate": 9.644087256027554e-08, - "logits/chosen": -2.786858320236206, - "logits/rejected": -2.7591071128845215, - "logps/chosen": -313.44757080078125, - "logps/rejected": -352.6889953613281, - "loss": 0.0128, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.04645581170916557, - "rewards/margins": 9.099355697631836, - "rewards/rejected": -9.14581298828125, + "epoch": 1.24, + "learning_rate": 3.261617900172117e-07, + "logits/chosen": -2.8089540004730225, + "logits/rejected": -2.8392434120178223, + "logps/chosen": -250.58303833007812, + "logps/rejected": -347.5687561035156, + "loss": 0.1026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9663568735122681, + "rewards/margins": 6.710597991943359, + "rewards/rejected": -5.744241714477539, "step": 2400 }, { - "epoch": 2.48, - "eval_logits/chosen": -2.8010993003845215, - "eval_logits/rejected": -2.77828049659729, - "eval_logps/chosen": -373.6317443847656, - "eval_logps/rejected": -340.2928161621094, - "eval_loss": 0.6771968007087708, - "eval_rewards/accuracies": 0.7699999809265137, - "eval_rewards/chosen": -2.0765931606292725, - "eval_rewards/margins": 3.540759563446045, - "eval_rewards/rejected": -5.617353439331055, - "eval_runtime": 498.7331, - "eval_samples_per_second": 4.01, - "eval_steps_per_second": 0.501, + "epoch": 1.24, + "eval_logits/chosen": -2.8102970123291016, + "eval_logits/rejected": -2.8701605796813965, + "eval_logps/chosen": -246.5922393798828, + "eval_logps/rejected": -315.093994140625, + "eval_loss": 0.48624616861343384, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": 0.1087898463010788, + "eval_rewards/margins": 4.004448413848877, + "eval_rewards/rejected": -3.8956587314605713, + "eval_runtime": 278.4354, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.449, "step": 2400 }, { - "epoch": 2.49, - "learning_rate": 9.45273631840796e-08, - "logits/chosen": -2.83256196975708, - "logits/rejected": -2.817204713821411, - "logps/chosen": -316.6646423339844, - "logps/rejected": -365.89801025390625, - "loss": 0.0216, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.18594422936439514, - "rewards/margins": 8.41205883026123, - "rewards/rejected": -8.226114273071289, + "epoch": 1.24, + "learning_rate": 3.2520558424172876e-07, + "logits/chosen": -2.8509275913238525, + "logits/rejected": -2.8675315380096436, + "logps/chosen": -216.6479034423828, + "logps/rejected": -314.75885009765625, + "loss": 0.0799, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7996807098388672, + "rewards/margins": 7.730632781982422, + "rewards/rejected": -5.930952548980713, "step": 2410 }, { - "epoch": 2.5, - "learning_rate": 9.261385380788366e-08, - "logits/chosen": -2.800245523452759, - "logits/rejected": -2.8351614475250244, - "logps/chosen": -310.83319091796875, - "logps/rejected": -358.3487854003906, - "loss": 0.0132, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.3432609736919403, - "rewards/margins": 8.361449241638184, - "rewards/rejected": -8.0181884765625, + "epoch": 1.25, + "learning_rate": 3.242493784662459e-07, + "logits/chosen": -2.8042349815368652, + "logits/rejected": -2.8402154445648193, + "logps/chosen": -223.0919647216797, + "logps/rejected": -324.47357177734375, + "loss": 0.0694, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4728593826293945, + "rewards/margins": 8.073456764221191, + "rewards/rejected": -6.600597381591797, "step": 2420 }, { - "epoch": 2.51, - "learning_rate": 9.070034443168771e-08, - "logits/chosen": -2.827857732772827, - "logits/rejected": -2.817192792892456, - "logps/chosen": -314.1051025390625, - "logps/rejected": -337.80670166015625, - "loss": 0.0122, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.01949559524655342, - "rewards/margins": 8.174915313720703, - "rewards/rejected": -8.19441032409668, + "epoch": 1.25, + "learning_rate": 3.2329317269076304e-07, + "logits/chosen": -2.7997336387634277, + "logits/rejected": -2.8355021476745605, + "logps/chosen": -275.755615234375, + "logps/rejected": -410.38885498046875, + "loss": 0.0997, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0294463634490967, + "rewards/margins": 8.784956932067871, + "rewards/rejected": -7.755509853363037, "step": 2430 }, { - "epoch": 2.52, - "learning_rate": 8.878683505549177e-08, - "logits/chosen": -2.8365912437438965, - "logits/rejected": -2.8096940517425537, - "logps/chosen": -315.1009216308594, - "logps/rejected": -345.9786376953125, - "loss": 0.0168, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.3356740176677704, - "rewards/margins": 8.638749122619629, - "rewards/rejected": -8.303074836730957, + "epoch": 1.26, + "learning_rate": 3.2233696691528016e-07, + "logits/chosen": -2.7436885833740234, + "logits/rejected": -2.8203845024108887, + "logps/chosen": -198.19422912597656, + "logps/rejected": -370.61602783203125, + "loss": 0.1348, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.599268913269043, + "rewards/margins": 7.958105564117432, + "rewards/rejected": -6.3588361740112305, "step": 2440 }, { - "epoch": 2.53, - "learning_rate": 8.687332567929582e-08, - "logits/chosen": -2.7972190380096436, - "logits/rejected": -2.7952070236206055, - "logps/chosen": -331.07855224609375, - "logps/rejected": -361.274658203125, - "loss": 0.0164, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.03445981815457344, - "rewards/margins": 8.762239456176758, - "rewards/rejected": -8.796700477600098, + "epoch": 1.26, + "learning_rate": 3.2138076113979727e-07, + "logits/chosen": -2.7402098178863525, + "logits/rejected": -2.773638963699341, + "logps/chosen": -274.2806396484375, + "logps/rejected": -379.9324035644531, + "loss": 0.0887, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.12480942904949188, + "rewards/margins": 7.0526933670043945, + "rewards/rejected": -6.927884101867676, "step": 2450 }, { - "epoch": 2.54, - "learning_rate": 8.495981630309988e-08, - "logits/chosen": -2.782491683959961, - "logits/rejected": -2.798245906829834, - "logps/chosen": -388.80487060546875, - "logps/rejected": -385.5155944824219, - "loss": 0.0169, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.13862161338329315, - "rewards/margins": 8.304658889770508, - "rewards/rejected": -8.166036605834961, + "epoch": 1.27, + "learning_rate": 3.204245553643144e-07, + "logits/chosen": -2.7169342041015625, + "logits/rejected": -2.7519021034240723, + "logps/chosen": -303.33770751953125, + "logps/rejected": -361.6357421875, + "loss": 0.0583, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9514210820198059, + "rewards/margins": 7.6320390701293945, + "rewards/rejected": -6.680616855621338, "step": 2460 }, { - "epoch": 2.55, - "learning_rate": 8.304630692690395e-08, - "logits/chosen": -2.7862939834594727, - "logits/rejected": -2.800889015197754, - "logps/chosen": -340.80450439453125, - "logps/rejected": -324.3976135253906, - "loss": 0.0209, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.042531587183475494, - "rewards/margins": 9.009973526000977, - "rewards/rejected": -8.967442512512207, + "epoch": 1.28, + "learning_rate": 3.194683495888315e-07, + "logits/chosen": -2.7771124839782715, + "logits/rejected": -2.83833646774292, + "logps/chosen": -237.21743774414062, + "logps/rejected": -352.67486572265625, + "loss": 0.1615, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3097350597381592, + "rewards/margins": 7.885691165924072, + "rewards/rejected": -6.575955867767334, "step": 2470 }, { - "epoch": 2.56, - "learning_rate": 8.1132797550708e-08, - "logits/chosen": -2.860032796859741, - "logits/rejected": -2.8409759998321533, - "logps/chosen": -374.20684814453125, - "logps/rejected": -355.40087890625, - "loss": 0.0088, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.24193501472473145, - "rewards/margins": 8.717119216918945, - "rewards/rejected": -8.475184440612793, + "epoch": 1.28, + "learning_rate": 3.185121438133486e-07, + "logits/chosen": -2.7796790599823, + "logits/rejected": -2.78997540473938, + "logps/chosen": -293.5530700683594, + "logps/rejected": -425.14208984375, + "loss": 0.0732, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4847608804702759, + "rewards/margins": 8.36500358581543, + "rewards/rejected": -6.88024377822876, "step": 2480 }, { - "epoch": 2.57, - "learning_rate": 7.921928817451206e-08, - "logits/chosen": -2.8591599464416504, - "logits/rejected": -2.8410227298736572, - "logps/chosen": -335.44976806640625, - "logps/rejected": -363.7392578125, - "loss": 0.0115, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.2703700363636017, - "rewards/margins": 9.15362548828125, - "rewards/rejected": -8.883255004882812, + "epoch": 1.29, + "learning_rate": 3.1755593803786574e-07, + "logits/chosen": -2.8316915035247803, + "logits/rejected": -2.8410391807556152, + "logps/chosen": -223.18466186523438, + "logps/rejected": -332.941162109375, + "loss": 0.0682, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.1314014196395874, + "rewards/margins": 8.023019790649414, + "rewards/rejected": -6.891618251800537, "step": 2490 }, { - "epoch": 2.58, - "learning_rate": 7.73057787983161e-08, - "logits/chosen": -2.8341917991638184, - "logits/rejected": -2.8292124271392822, - "logps/chosen": -357.0975341796875, - "logps/rejected": -373.5026550292969, - "loss": 0.015, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.19130437076091766, - "rewards/margins": 8.608747482299805, - "rewards/rejected": -8.80005168914795, + "epoch": 1.29, + "learning_rate": 3.1659973226238285e-07, + "logits/chosen": -2.8022923469543457, + "logits/rejected": -2.8136115074157715, + "logps/chosen": -260.37603759765625, + "logps/rejected": -328.55810546875, + "loss": 0.104, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0766308307647705, + "rewards/margins": 7.775477409362793, + "rewards/rejected": -6.69884729385376, "step": 2500 }, { - "epoch": 2.58, - "eval_logits/chosen": -2.7877330780029297, - "eval_logits/rejected": -2.7656519412994385, - "eval_logps/chosen": -375.72076416015625, - "eval_logps/rejected": -343.2975769042969, - "eval_loss": 0.6893309354782104, - "eval_rewards/accuracies": 0.7739999890327454, - "eval_rewards/chosen": -2.285490036010742, - "eval_rewards/margins": 3.6323366165161133, - "eval_rewards/rejected": -5.9178266525268555, - "eval_runtime": 499.4861, - "eval_samples_per_second": 4.004, - "eval_steps_per_second": 0.501, + "epoch": 1.29, + "eval_logits/chosen": -2.7535407543182373, + "eval_logits/rejected": -2.810478448867798, + "eval_logps/chosen": -253.7227783203125, + "eval_logps/rejected": -326.864013671875, + "eval_loss": 0.5141146779060364, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -0.60426265001297, + "eval_rewards/margins": 4.468394756317139, + "eval_rewards/rejected": -5.072656631469727, + "eval_runtime": 279.4079, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.447, "step": 2500 }, { - "epoch": 2.59, - "learning_rate": 7.539226942212017e-08, - "logits/chosen": -2.7619540691375732, - "logits/rejected": -2.774597644805908, - "logps/chosen": -318.77679443359375, - "logps/rejected": -376.66656494140625, - "loss": 0.0153, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.632733941078186, - "rewards/margins": 8.800397872924805, - "rewards/rejected": -9.433133125305176, + "epoch": 1.3, + "learning_rate": 3.1564352648689997e-07, + "logits/chosen": -2.826930522918701, + "logits/rejected": -2.872891902923584, + "logps/chosen": -247.903564453125, + "logps/rejected": -338.798095703125, + "loss": 0.1311, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.7270460724830627, + "rewards/margins": 7.407177925109863, + "rewards/rejected": -6.680130958557129, "step": 2510 }, { - "epoch": 2.6, - "learning_rate": 7.347876004592423e-08, - "logits/chosen": -2.8549551963806152, - "logits/rejected": -2.868101119995117, - "logps/chosen": -300.10797119140625, - "logps/rejected": -321.92755126953125, - "loss": 0.0138, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.15570883452892303, - "rewards/margins": 8.049636840820312, - "rewards/rejected": -8.205345153808594, + "epoch": 1.3, + "learning_rate": 3.146873207114171e-07, + "logits/chosen": -2.749908447265625, + "logits/rejected": -2.7867674827575684, + "logps/chosen": -194.88047790527344, + "logps/rejected": -334.09234619140625, + "loss": 0.0771, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3872716426849365, + "rewards/margins": 8.179627418518066, + "rewards/rejected": -6.792355537414551, "step": 2520 }, { - "epoch": 2.61, - "learning_rate": 7.156525066972828e-08, - "logits/chosen": -2.8064002990722656, - "logits/rejected": -2.805536985397339, - "logps/chosen": -322.34814453125, - "logps/rejected": -366.001220703125, - "loss": 0.0134, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.4449862837791443, - "rewards/margins": 8.548049926757812, - "rewards/rejected": -8.993036270141602, + "epoch": 1.31, + "learning_rate": 3.137311149359342e-07, + "logits/chosen": -2.790775775909424, + "logits/rejected": -2.8377020359039307, + "logps/chosen": -282.1646423339844, + "logps/rejected": -381.095703125, + "loss": 0.091, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0731854438781738, + "rewards/margins": 6.98352575302124, + "rewards/rejected": -5.910340309143066, "step": 2530 }, { - "epoch": 2.62, - "learning_rate": 6.965174129353234e-08, - "logits/chosen": -2.832437515258789, - "logits/rejected": -2.830235004425049, - "logps/chosen": -385.7052307128906, - "logps/rejected": -384.5820617675781, - "loss": 0.0196, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.25569507479667664, - "rewards/margins": 8.444574356079102, - "rewards/rejected": -8.188879013061523, + "epoch": 1.31, + "learning_rate": 3.127749091604513e-07, + "logits/chosen": -2.8378074169158936, + "logits/rejected": -2.8901278972625732, + "logps/chosen": -307.4638977050781, + "logps/rejected": -385.1072998046875, + "loss": 0.074, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6109613180160522, + "rewards/margins": 8.63659381866455, + "rewards/rejected": -7.025631904602051, "step": 2540 }, { - "epoch": 2.63, - "learning_rate": 6.773823191733639e-08, - "logits/chosen": -2.768073320388794, - "logits/rejected": -2.8013052940368652, - "logps/chosen": -321.265625, - "logps/rejected": -381.2618103027344, - "loss": 0.0159, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.49428802728652954, - "rewards/margins": 9.570723533630371, - "rewards/rejected": -10.065011978149414, + "epoch": 1.32, + "learning_rate": 3.1181870338496843e-07, + "logits/chosen": -2.825392007827759, + "logits/rejected": -2.8577558994293213, + "logps/chosen": -219.1394805908203, + "logps/rejected": -337.9822692871094, + "loss": 0.1222, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7719315886497498, + "rewards/margins": 6.5245184898376465, + "rewards/rejected": -5.752586364746094, "step": 2550 }, { - "epoch": 2.64, - "learning_rate": 6.582472254114045e-08, - "logits/chosen": -2.8302340507507324, - "logits/rejected": -2.800615072250366, - "logps/chosen": -331.8475036621094, - "logps/rejected": -366.77777099609375, - "loss": 0.0167, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.04411726072430611, - "rewards/margins": 9.26185131072998, - "rewards/rejected": -9.305968284606934, + "epoch": 1.32, + "learning_rate": 3.108624976094856e-07, + "logits/chosen": -2.869966506958008, + "logits/rejected": -2.9369266033172607, + "logps/chosen": -232.96163940429688, + "logps/rejected": -306.9654235839844, + "loss": 0.1048, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9050451517105103, + "rewards/margins": 7.699380397796631, + "rewards/rejected": -5.794335842132568, "step": 2560 }, { - "epoch": 2.65, - "learning_rate": 6.391121316494451e-08, - "logits/chosen": -2.8136613368988037, - "logits/rejected": -2.8105883598327637, - "logps/chosen": -348.7919006347656, - "logps/rejected": -370.4637145996094, - "loss": 0.013, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.12355004251003265, - "rewards/margins": 8.435611724853516, - "rewards/rejected": -8.559160232543945, + "epoch": 1.33, + "learning_rate": 3.0990629183400266e-07, + "logits/chosen": -2.839395046234131, + "logits/rejected": -2.8635356426239014, + "logps/chosen": -239.1020050048828, + "logps/rejected": -366.38623046875, + "loss": 0.1204, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9102404117584229, + "rewards/margins": 9.183894157409668, + "rewards/rejected": -7.273654937744141, "step": 2570 }, { - "epoch": 2.66, - "learning_rate": 6.199770378874856e-08, - "logits/chosen": -2.8767693042755127, - "logits/rejected": -2.8680922985076904, - "logps/chosen": -328.68975830078125, - "logps/rejected": -375.6171875, - "loss": 0.0135, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.11462025344371796, - "rewards/margins": 8.414809226989746, - "rewards/rejected": -8.52942943572998, + "epoch": 1.33, + "learning_rate": 3.089500860585198e-07, + "logits/chosen": -2.829850912094116, + "logits/rejected": -2.887627363204956, + "logps/chosen": -200.79571533203125, + "logps/rejected": -296.86932373046875, + "loss": 0.1213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3977177143096924, + "rewards/margins": 6.472372531890869, + "rewards/rejected": -6.074654579162598, "step": 2580 }, { - "epoch": 2.67, - "learning_rate": 6.008419441255262e-08, - "logits/chosen": -2.7816498279571533, - "logits/rejected": -2.7839863300323486, - "logps/chosen": -349.9986267089844, - "logps/rejected": -363.5763854980469, - "loss": 0.0155, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08176889270544052, - "rewards/margins": 8.521749496459961, - "rewards/rejected": -8.439981460571289, + "epoch": 1.34, + "learning_rate": 3.079938802830369e-07, + "logits/chosen": -2.911804676055908, + "logits/rejected": -2.9058237075805664, + "logps/chosen": -226.080810546875, + "logps/rejected": -261.0593566894531, + "loss": 0.0687, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5239999294281006, + "rewards/margins": 7.221141815185547, + "rewards/rejected": -5.697141647338867, "step": 2590 }, { - "epoch": 2.69, - "learning_rate": 5.817068503635668e-08, - "logits/chosen": -2.8423211574554443, - "logits/rejected": -2.8518404960632324, - "logps/chosen": -342.69873046875, - "logps/rejected": -373.8768005371094, - "loss": 0.0118, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.09304714947938919, - "rewards/margins": 8.714309692382812, - "rewards/rejected": -8.807355880737305, + "epoch": 1.34, + "learning_rate": 3.07037674507554e-07, + "logits/chosen": -2.8087925910949707, + "logits/rejected": -2.8416666984558105, + "logps/chosen": -226.8069610595703, + "logps/rejected": -407.0137939453125, + "loss": 0.0728, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4006208181381226, + "rewards/margins": 7.8127875328063965, + "rewards/rejected": -6.412166595458984, "step": 2600 }, { - "epoch": 2.69, - "eval_logits/chosen": -2.796252965927124, - "eval_logits/rejected": -2.776649236679077, - "eval_logps/chosen": -377.9178771972656, - "eval_logps/rejected": -346.03851318359375, - "eval_loss": 0.6936560273170471, - "eval_rewards/accuracies": 0.765999972820282, - "eval_rewards/chosen": -2.505204200744629, - "eval_rewards/margins": 3.686720609664917, - "eval_rewards/rejected": -6.191924095153809, - "eval_runtime": 499.6825, - "eval_samples_per_second": 4.003, - "eval_steps_per_second": 0.5, + "epoch": 1.34, + "eval_logits/chosen": -2.8015549182891846, + "eval_logits/rejected": -2.86586594581604, + "eval_logps/chosen": -253.48963928222656, + "eval_logps/rejected": -326.0744323730469, + "eval_loss": 0.5166017413139343, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -0.5809494256973267, + "eval_rewards/margins": 4.412755012512207, + "eval_rewards/rejected": -4.993704319000244, + "eval_runtime": 278.9732, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.448, "step": 2600 }, { - "epoch": 2.7, - "learning_rate": 5.6257175660160735e-08, - "logits/chosen": -2.8501954078674316, - "logits/rejected": -2.8585927486419678, - "logps/chosen": -330.80767822265625, - "logps/rejected": -349.1222229003906, - "loss": 0.0099, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.3493829369544983, - "rewards/margins": 8.983522415161133, - "rewards/rejected": -8.634138107299805, + "epoch": 1.35, + "learning_rate": 3.060814687320711e-07, + "logits/chosen": -2.804701089859009, + "logits/rejected": -2.8087198734283447, + "logps/chosen": -262.0416564941406, + "logps/rejected": -343.32769775390625, + "loss": 0.0779, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0394731760025024, + "rewards/margins": 7.696552276611328, + "rewards/rejected": -6.657078742980957, "step": 2610 }, { - "epoch": 2.71, - "learning_rate": 5.4343666283964784e-08, - "logits/chosen": -2.84842586517334, - "logits/rejected": -2.8621630668640137, - "logps/chosen": -339.36761474609375, - "logps/rejected": -352.922607421875, - "loss": 0.0115, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.036901332437992096, - "rewards/margins": 9.127528190612793, - "rewards/rejected": -9.164429664611816, + "epoch": 1.35, + "learning_rate": 3.0512526295658824e-07, + "logits/chosen": -2.7707972526550293, + "logits/rejected": -2.8042078018188477, + "logps/chosen": -240.3668975830078, + "logps/rejected": -345.0741271972656, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2935864925384521, + "rewards/margins": 8.221081733703613, + "rewards/rejected": -6.92749547958374, "step": 2620 }, { - "epoch": 2.72, - "learning_rate": 5.243015690776884e-08, - "logits/chosen": -2.8185672760009766, - "logits/rejected": -2.8214826583862305, - "logps/chosen": -354.1401672363281, - "logps/rejected": -386.0504455566406, - "loss": 0.0123, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.03651510551571846, - "rewards/margins": 9.25926685333252, - "rewards/rejected": -9.295781135559082, - "step": 2630 + "epoch": 1.36, + "learning_rate": 3.0416905718110536e-07, + "logits/chosen": -2.9000189304351807, + "logits/rejected": -2.9776604175567627, + "logps/chosen": -212.4930877685547, + "logps/rejected": -303.805908203125, + "loss": 0.1076, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9751327633857727, + "rewards/margins": 7.801082611083984, + "rewards/rejected": -6.825949192047119, + "step": 2630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0321285140562247e-07, + "logits/chosen": -2.8881120681762695, + "logits/rejected": -2.9165701866149902, + "logps/chosen": -246.6224365234375, + "logps/rejected": -306.96014404296875, + "loss": 0.092, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.041773706674575806, + "rewards/margins": 6.964110374450684, + "rewards/rejected": -6.922336578369141, + "step": 2640 + }, + { + "epoch": 1.37, + "learning_rate": 3.022566456301396e-07, + "logits/chosen": -2.865769147872925, + "logits/rejected": -2.8971917629241943, + "logps/chosen": -223.96884155273438, + "logps/rejected": -329.19268798828125, + "loss": 0.1457, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.885826587677002, + "rewards/margins": 7.315209865570068, + "rewards/rejected": -6.429383754730225, + "step": 2650 + }, + { + "epoch": 1.37, + "learning_rate": 3.013004398546567e-07, + "logits/chosen": -2.846896171569824, + "logits/rejected": -2.8020544052124023, + "logps/chosen": -225.41665649414062, + "logps/rejected": -355.18658447265625, + "loss": 0.104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7547341585159302, + "rewards/margins": 7.292550563812256, + "rewards/rejected": -6.537816047668457, + "step": 2660 + }, + { + "epoch": 1.38, + "learning_rate": 3.003442340791738e-07, + "logits/chosen": -2.9464218616485596, + "logits/rejected": -2.9827699661254883, + "logps/chosen": -226.2906494140625, + "logps/rejected": -321.0039978027344, + "loss": 0.0758, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8007649183273315, + "rewards/margins": 7.7752275466918945, + "rewards/rejected": -6.974462032318115, + "step": 2670 + }, + { + "epoch": 1.38, + "learning_rate": 2.9938802830369093e-07, + "logits/chosen": -2.916825771331787, + "logits/rejected": -2.9472413063049316, + "logps/chosen": -264.4847412109375, + "logps/rejected": -352.3399353027344, + "loss": 0.1488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8589404225349426, + "rewards/margins": 8.047723770141602, + "rewards/rejected": -7.188782691955566, + "step": 2680 + }, + { + "epoch": 1.39, + "learning_rate": 2.9843182252820805e-07, + "logits/chosen": -2.893075704574585, + "logits/rejected": -2.9288330078125, + "logps/chosen": -200.18128967285156, + "logps/rejected": -332.02392578125, + "loss": 0.097, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.269403100013733, + "rewards/margins": 8.271394729614258, + "rewards/rejected": -7.001992702484131, + "step": 2690 + }, + { + "epoch": 1.39, + "learning_rate": 2.974756167527252e-07, + "logits/chosen": -2.94435715675354, + "logits/rejected": -2.9935834407806396, + "logps/chosen": -218.6113739013672, + "logps/rejected": -337.3885803222656, + "loss": 0.0844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4172857403755188, + "rewards/margins": 6.798937797546387, + "rewards/rejected": -6.38165283203125, + "step": 2700 + }, + { + "epoch": 1.39, + "eval_logits/chosen": -2.8305044174194336, + "eval_logits/rejected": -2.8900794982910156, + "eval_logps/chosen": -253.89149475097656, + "eval_logps/rejected": -322.5744323730469, + "eval_loss": 0.4835141599178314, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -0.6211313009262085, + "eval_rewards/margins": 4.022569179534912, + "eval_rewards/rejected": -4.64370059967041, + "eval_runtime": 278.7686, + "eval_samples_per_second": 7.174, + "eval_steps_per_second": 0.448, + "step": 2700 + }, + { + "epoch": 1.4, + "learning_rate": 2.9651941097724233e-07, + "logits/chosen": -2.9109933376312256, + "logits/rejected": -2.9616150856018066, + "logps/chosen": -247.9290008544922, + "logps/rejected": -294.053466796875, + "loss": 0.1869, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8045139312744141, + "rewards/margins": 6.199316501617432, + "rewards/rejected": -5.394802093505859, + "step": 2710 + }, + { + "epoch": 1.4, + "learning_rate": 2.9556320520175945e-07, + "logits/chosen": -2.915855884552002, + "logits/rejected": -2.9131040573120117, + "logps/chosen": -244.89242553710938, + "logps/rejected": -310.23931884765625, + "loss": 0.1395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6600807905197144, + "rewards/margins": 7.267942905426025, + "rewards/rejected": -6.6078619956970215, + "step": 2720 + }, + { + "epoch": 1.41, + "learning_rate": 2.946069994262765e-07, + "logits/chosen": -2.9800467491149902, + "logits/rejected": -2.9676592350006104, + "logps/chosen": -240.47412109375, + "logps/rejected": -379.64306640625, + "loss": 0.1346, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6723322868347168, + "rewards/margins": 7.420382499694824, + "rewards/rejected": -6.748049736022949, + "step": 2730 + }, + { + "epoch": 1.41, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": -2.9388279914855957, + "logits/rejected": -2.9169228076934814, + "logps/chosen": -240.00277709960938, + "logps/rejected": -291.1402282714844, + "loss": 0.0738, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0642039775848389, + "rewards/margins": 7.567967891693115, + "rewards/rejected": -6.5037641525268555, + "step": 2740 + }, + { + "epoch": 1.42, + "learning_rate": 2.9269458787531074e-07, + "logits/chosen": -2.854447603225708, + "logits/rejected": -2.8605027198791504, + "logps/chosen": -214.74441528320312, + "logps/rejected": -344.01983642578125, + "loss": 0.1133, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8734567761421204, + "rewards/margins": 6.573145866394043, + "rewards/rejected": -5.699689865112305, + "step": 2750 + }, + { + "epoch": 1.42, + "learning_rate": 2.9173838209982786e-07, + "logits/chosen": -2.8964381217956543, + "logits/rejected": -2.9152231216430664, + "logps/chosen": -248.6080780029297, + "logps/rejected": -294.92474365234375, + "loss": 0.0746, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7927258610725403, + "rewards/margins": 7.022311210632324, + "rewards/rejected": -6.229586601257324, + "step": 2760 + }, + { + "epoch": 1.43, + "learning_rate": 2.90782176324345e-07, + "logits/chosen": -2.9137072563171387, + "logits/rejected": -2.9111227989196777, + "logps/chosen": -331.26812744140625, + "logps/rejected": -336.87799072265625, + "loss": 0.1427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7486933469772339, + "rewards/margins": 7.747340202331543, + "rewards/rejected": -6.998647212982178, + "step": 2770 + }, + { + "epoch": 1.44, + "learning_rate": 2.898259705488621e-07, + "logits/chosen": -2.950000047683716, + "logits/rejected": -3.0376858711242676, + "logps/chosen": -167.78564453125, + "logps/rejected": -316.28973388671875, + "loss": 0.0717, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9677039980888367, + "rewards/margins": 7.978043556213379, + "rewards/rejected": -7.010340213775635, + "step": 2780 + }, + { + "epoch": 1.44, + "learning_rate": 2.888697647733792e-07, + "logits/chosen": -2.950089693069458, + "logits/rejected": -2.9753165245056152, + "logps/chosen": -271.12347412109375, + "logps/rejected": -380.61383056640625, + "loss": 0.1185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3904229402542114, + "rewards/margins": 9.421290397644043, + "rewards/rejected": -8.030868530273438, + "step": 2790 + }, + { + "epoch": 1.45, + "learning_rate": 2.879135589978963e-07, + "logits/chosen": -2.8828554153442383, + "logits/rejected": -2.95314359664917, + "logps/chosen": -220.3723602294922, + "logps/rejected": -351.38153076171875, + "loss": 0.0733, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.192879319190979, + "rewards/margins": 7.078371524810791, + "rewards/rejected": -5.885491371154785, + "step": 2800 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.8813791275024414, + "eval_logits/rejected": -2.931053876876831, + "eval_logps/chosen": -249.5429229736328, + "eval_logps/rejected": -317.8975524902344, + "eval_loss": 0.47381946444511414, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -0.18627841770648956, + "eval_rewards/margins": 3.9897348880767822, + "eval_rewards/rejected": -4.176013469696045, + "eval_runtime": 278.3124, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.449, + "step": 2800 + }, + { + "epoch": 1.45, + "learning_rate": 2.8695735322241344e-07, + "logits/chosen": -2.9480504989624023, + "logits/rejected": -2.9745919704437256, + "logps/chosen": -222.18984985351562, + "logps/rejected": -341.6588439941406, + "loss": 0.0824, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.742396593093872, + "rewards/margins": 8.565423011779785, + "rewards/rejected": -6.823026180267334, + "step": 2810 + }, + { + "epoch": 1.46, + "learning_rate": 2.8600114744693055e-07, + "logits/chosen": -2.8825483322143555, + "logits/rejected": -2.901355266571045, + "logps/chosen": -202.0137176513672, + "logps/rejected": -344.7021789550781, + "loss": 0.0785, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.592545986175537, + "rewards/margins": 8.239890098571777, + "rewards/rejected": -6.64734411239624, + "step": 2820 + }, + { + "epoch": 1.46, + "learning_rate": 2.8504494167144767e-07, + "logits/chosen": -2.945234775543213, + "logits/rejected": -2.946733236312866, + "logps/chosen": -282.22637939453125, + "logps/rejected": -344.6810302734375, + "loss": 0.0707, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0531352758407593, + "rewards/margins": 6.8026628494262695, + "rewards/rejected": -5.749527454376221, + "step": 2830 + }, + { + "epoch": 1.47, + "learning_rate": 2.8408873589596484e-07, + "logits/chosen": -2.9102578163146973, + "logits/rejected": -2.9559600353240967, + "logps/chosen": -233.95407104492188, + "logps/rejected": -308.8085632324219, + "loss": 0.0953, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3472100496292114, + "rewards/margins": 7.695229530334473, + "rewards/rejected": -6.348019599914551, + "step": 2840 + }, + { + "epoch": 1.47, + "learning_rate": 2.8313253012048195e-07, + "logits/chosen": -2.8096060752868652, + "logits/rejected": -2.82187557220459, + "logps/chosen": -232.92648315429688, + "logps/rejected": -363.13311767578125, + "loss": 0.1149, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.446784496307373, + "rewards/margins": 7.554309844970703, + "rewards/rejected": -6.107525825500488, + "step": 2850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8217632434499907e-07, + "logits/chosen": -2.884981632232666, + "logits/rejected": -2.9340274333953857, + "logps/chosen": -282.1310729980469, + "logps/rejected": -352.1937561035156, + "loss": 0.0681, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.166867971420288, + "rewards/margins": 7.277437686920166, + "rewards/rejected": -6.110569000244141, + "step": 2860 + }, + { + "epoch": 1.48, + "learning_rate": 2.812201185695162e-07, + "logits/chosen": -2.9189248085021973, + "logits/rejected": -2.939964771270752, + "logps/chosen": -189.5813751220703, + "logps/rejected": -333.9523010253906, + "loss": 0.0679, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.9171149730682373, + "rewards/margins": 7.77141809463501, + "rewards/rejected": -5.854302406311035, + "step": 2870 + }, + { + "epoch": 1.49, + "learning_rate": 2.802639127940333e-07, + "logits/chosen": -2.889430046081543, + "logits/rejected": -2.902904748916626, + "logps/chosen": -226.6869659423828, + "logps/rejected": -277.851806640625, + "loss": 0.1103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.285125732421875, + "rewards/margins": 7.303654670715332, + "rewards/rejected": -6.018527507781982, + "step": 2880 + }, + { + "epoch": 1.49, + "learning_rate": 2.7930770701855036e-07, + "logits/chosen": -2.9159200191497803, + "logits/rejected": -2.9363036155700684, + "logps/chosen": -252.4503936767578, + "logps/rejected": -314.83441162109375, + "loss": 0.0992, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2371034622192383, + "rewards/margins": 7.61843204498291, + "rewards/rejected": -6.381328582763672, + "step": 2890 + }, + { + "epoch": 1.5, + "learning_rate": 2.783515012430675e-07, + "logits/chosen": -2.8979902267456055, + "logits/rejected": -2.880366086959839, + "logps/chosen": -231.14559936523438, + "logps/rejected": -334.34869384765625, + "loss": 0.1837, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5941873788833618, + "rewards/margins": 7.379087924957275, + "rewards/rejected": -5.784900188446045, + "step": 2900 + }, + { + "epoch": 1.5, + "eval_logits/chosen": -2.8720028400421143, + "eval_logits/rejected": -2.9294917583465576, + "eval_logps/chosen": -247.88092041015625, + "eval_logps/rejected": -318.89837646484375, + "eval_loss": 0.47644469141960144, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": -0.02007720246911049, + "eval_rewards/margins": 4.2560200691223145, + "eval_rewards/rejected": -4.276097774505615, + "eval_runtime": 278.8337, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 0.448, + "step": 2900 + }, + { + "epoch": 1.5, + "learning_rate": 2.773952954675846e-07, + "logits/chosen": -2.932246446609497, + "logits/rejected": -2.9477548599243164, + "logps/chosen": -220.763916015625, + "logps/rejected": -286.6198425292969, + "loss": 0.1087, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.352697730064392, + "rewards/margins": 6.958559989929199, + "rewards/rejected": -5.605862140655518, + "step": 2910 + }, + { + "epoch": 1.51, + "learning_rate": 2.764390896921017e-07, + "logits/chosen": -2.934304714202881, + "logits/rejected": -2.936347723007202, + "logps/chosen": -256.7560729980469, + "logps/rejected": -320.73223876953125, + "loss": 0.117, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7255386114120483, + "rewards/margins": 7.111201286315918, + "rewards/rejected": -6.38566255569458, + "step": 2920 + }, + { + "epoch": 1.51, + "learning_rate": 2.754828839166188e-07, + "logits/chosen": -2.8261325359344482, + "logits/rejected": -2.9312081336975098, + "logps/chosen": -233.1983642578125, + "logps/rejected": -311.11590576171875, + "loss": 0.0778, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7787030935287476, + "rewards/margins": 9.04845142364502, + "rewards/rejected": -7.269747734069824, + "step": 2930 + }, + { + "epoch": 1.52, + "learning_rate": 2.7452667814113594e-07, + "logits/chosen": -2.989229202270508, + "logits/rejected": -2.977133274078369, + "logps/chosen": -293.3673400878906, + "logps/rejected": -334.6380310058594, + "loss": 0.0789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3550571203231812, + "rewards/margins": 6.886478424072266, + "rewards/rejected": -5.531420707702637, + "step": 2940 + }, + { + "epoch": 1.52, + "learning_rate": 2.7357047236565306e-07, + "logits/chosen": -2.8399806022644043, + "logits/rejected": -2.9234249591827393, + "logps/chosen": -234.34567260742188, + "logps/rejected": -336.1915588378906, + "loss": 0.0704, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.254919171333313, + "rewards/margins": 7.959687232971191, + "rewards/rejected": -6.704766750335693, + "step": 2950 + }, + { + "epoch": 1.53, + "learning_rate": 2.7261426659017017e-07, + "logits/chosen": -2.92905855178833, + "logits/rejected": -2.957266330718994, + "logps/chosen": -282.38031005859375, + "logps/rejected": -350.7286682128906, + "loss": 0.1406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6080772280693054, + "rewards/margins": 7.091387748718262, + "rewards/rejected": -6.483310699462891, + "step": 2960 + }, + { + "epoch": 1.53, + "learning_rate": 2.716580608146873e-07, + "logits/chosen": -2.959866762161255, + "logits/rejected": -2.978870391845703, + "logps/chosen": -312.2777404785156, + "logps/rejected": -347.9805603027344, + "loss": 0.0691, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.632709264755249, + "rewards/margins": 8.156556129455566, + "rewards/rejected": -6.523846626281738, + "step": 2970 + }, + { + "epoch": 1.54, + "learning_rate": 2.7070185503920446e-07, + "logits/chosen": -2.7604541778564453, + "logits/rejected": -2.850285053253174, + "logps/chosen": -251.9482421875, + "logps/rejected": -328.6760559082031, + "loss": 0.0711, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5054420232772827, + "rewards/margins": 7.880296230316162, + "rewards/rejected": -6.3748555183410645, + "step": 2980 + }, + { + "epoch": 1.54, + "learning_rate": 2.6974564926372157e-07, + "logits/chosen": -2.944751739501953, + "logits/rejected": -2.9440500736236572, + "logps/chosen": -243.61813354492188, + "logps/rejected": -359.72650146484375, + "loss": 0.0765, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.8091905117034912, + "rewards/margins": 8.164262771606445, + "rewards/rejected": -6.355072975158691, + "step": 2990 + }, + { + "epoch": 1.55, + "learning_rate": 2.687894434882387e-07, + "logits/chosen": -2.9194934368133545, + "logits/rejected": -2.926443099975586, + "logps/chosen": -212.02090454101562, + "logps/rejected": -273.5159912109375, + "loss": 0.2113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5424438714981079, + "rewards/margins": 5.714383602142334, + "rewards/rejected": -5.171939373016357, + "step": 3000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.843538999557495, + "eval_logits/rejected": -2.8977818489074707, + "eval_logps/chosen": -248.24984741210938, + "eval_logps/rejected": -315.9092712402344, + "eval_loss": 0.470895916223526, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -0.056969307363033295, + "eval_rewards/margins": 3.920220375061035, + "eval_rewards/rejected": -3.9771900177001953, + "eval_runtime": 278.5009, + "eval_samples_per_second": 7.181, + "eval_steps_per_second": 0.449, + "step": 3000 + }, + { + "epoch": 1.55, + "learning_rate": 2.678332377127558e-07, + "logits/chosen": -2.9059603214263916, + "logits/rejected": -2.879521369934082, + "logps/chosen": -255.6497039794922, + "logps/rejected": -346.8174743652344, + "loss": 0.0754, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9771935939788818, + "rewards/margins": 9.132074356079102, + "rewards/rejected": -7.154881477355957, + "step": 3010 + }, + { + "epoch": 1.56, + "learning_rate": 2.668770319372729e-07, + "logits/chosen": -2.8338193893432617, + "logits/rejected": -2.904914140701294, + "logps/chosen": -186.17552185058594, + "logps/rejected": -281.7873229980469, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6421797275543213, + "rewards/margins": 7.126562595367432, + "rewards/rejected": -5.484382629394531, + "step": 3020 + }, + { + "epoch": 1.56, + "learning_rate": 2.6592082616179004e-07, + "logits/chosen": -2.760566234588623, + "logits/rejected": -2.785473585128784, + "logps/chosen": -195.1143341064453, + "logps/rejected": -304.75128173828125, + "loss": 0.0727, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.42879682779312134, + "rewards/margins": 6.535356044769287, + "rewards/rejected": -6.1065592765808105, + "step": 3030 + }, + { + "epoch": 1.57, + "learning_rate": 2.649646203863071e-07, + "logits/chosen": -2.8217978477478027, + "logits/rejected": -2.904336929321289, + "logps/chosen": -247.8555450439453, + "logps/rejected": -356.73992919921875, + "loss": 0.1016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.409906029701233, + "rewards/margins": 7.027436256408691, + "rewards/rejected": -5.617530345916748, + "step": 3040 + }, + { + "epoch": 1.57, + "learning_rate": 2.640084146108242e-07, + "logits/chosen": -2.8797459602355957, + "logits/rejected": -2.927067279815674, + "logps/chosen": -301.1026306152344, + "logps/rejected": -334.0341491699219, + "loss": 0.0747, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.892419159412384, + "rewards/margins": 7.652726650238037, + "rewards/rejected": -6.760307312011719, + "step": 3050 + }, + { + "epoch": 1.58, + "learning_rate": 2.6305220883534133e-07, + "logits/chosen": -2.8873353004455566, + "logits/rejected": -2.894935369491577, + "logps/chosen": -186.02676391601562, + "logps/rejected": -322.01812744140625, + "loss": 0.082, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6815090775489807, + "rewards/margins": 7.164406776428223, + "rewards/rejected": -6.4828972816467285, + "step": 3060 + }, + { + "epoch": 1.58, + "learning_rate": 2.6209600305985845e-07, + "logits/chosen": -2.846514940261841, + "logits/rejected": -2.900930166244507, + "logps/chosen": -227.4207305908203, + "logps/rejected": -353.51153564453125, + "loss": 0.1415, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1509945392608643, + "rewards/margins": 8.626825332641602, + "rewards/rejected": -7.475831031799316, + "step": 3070 + }, + { + "epoch": 1.59, + "learning_rate": 2.6113979728437556e-07, + "logits/chosen": -2.9146270751953125, + "logits/rejected": -2.9426653385162354, + "logps/chosen": -249.2900848388672, + "logps/rejected": -329.2434387207031, + "loss": 0.0853, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4317848682403564, + "rewards/margins": 7.4246826171875, + "rewards/rejected": -5.992897033691406, + "step": 3080 + }, + { + "epoch": 1.6, + "learning_rate": 2.601835915088927e-07, + "logits/chosen": -2.760486125946045, + "logits/rejected": -2.8186051845550537, + "logps/chosen": -211.21728515625, + "logps/rejected": -329.289794921875, + "loss": 0.1681, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4863201677799225, + "rewards/margins": 6.338644027709961, + "rewards/rejected": -5.85232400894165, + "step": 3090 + }, + { + "epoch": 1.6, + "learning_rate": 2.592273857334098e-07, + "logits/chosen": -2.8253026008605957, + "logits/rejected": -2.86600661277771, + "logps/chosen": -195.7429656982422, + "logps/rejected": -351.58770751953125, + "loss": 0.1858, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2111473083496094, + "rewards/margins": 8.60848331451416, + "rewards/rejected": -7.397336006164551, + "step": 3100 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.849841833114624, + "eval_logits/rejected": -2.9042794704437256, + "eval_logps/chosen": -249.63946533203125, + "eval_logps/rejected": -318.3751220703125, + "eval_loss": 0.47693005204200745, + "eval_rewards/accuracies": 0.7960000038146973, + "eval_rewards/chosen": -0.19593170285224915, + "eval_rewards/margins": 4.027840614318848, + "eval_rewards/rejected": -4.2237725257873535, + "eval_runtime": 278.9329, + "eval_samples_per_second": 7.17, + "eval_steps_per_second": 0.448, + "step": 3100 + }, + { + "epoch": 1.61, + "learning_rate": 2.582711799579269e-07, + "logits/chosen": -2.9083123207092285, + "logits/rejected": -2.9558422565460205, + "logps/chosen": -283.11962890625, + "logps/rejected": -353.2391052246094, + "loss": 0.1054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2175567150115967, + "rewards/margins": 7.344391822814941, + "rewards/rejected": -6.126835346221924, + "step": 3110 + }, + { + "epoch": 1.61, + "learning_rate": 2.573149741824441e-07, + "logits/chosen": -2.9085631370544434, + "logits/rejected": -2.9231581687927246, + "logps/chosen": -255.6195831298828, + "logps/rejected": -324.60516357421875, + "loss": 0.1551, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3148778676986694, + "rewards/margins": 7.363683223724365, + "rewards/rejected": -6.048805236816406, + "step": 3120 + }, + { + "epoch": 1.62, + "learning_rate": 2.563587684069612e-07, + "logits/chosen": -2.9189603328704834, + "logits/rejected": -2.9584662914276123, + "logps/chosen": -279.16839599609375, + "logps/rejected": -339.174072265625, + "loss": 0.0702, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0117626190185547, + "rewards/margins": 7.726855278015137, + "rewards/rejected": -6.715092658996582, + "step": 3130 + }, + { + "epoch": 1.62, + "learning_rate": 2.554025626314783e-07, + "logits/chosen": -2.9455857276916504, + "logits/rejected": -2.9266839027404785, + "logps/chosen": -238.67636108398438, + "logps/rejected": -348.72894287109375, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6194132566452026, + "rewards/margins": 8.241477966308594, + "rewards/rejected": -7.62206506729126, + "step": 3140 + }, + { + "epoch": 1.63, + "learning_rate": 2.544463568559954e-07, + "logits/chosen": -2.9469656944274902, + "logits/rejected": -2.9347240924835205, + "logps/chosen": -267.73394775390625, + "logps/rejected": -353.4038391113281, + "loss": 0.0658, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.103703498840332, + "rewards/margins": 8.258222579956055, + "rewards/rejected": -7.154518127441406, + "step": 3150 + }, + { + "epoch": 1.63, + "learning_rate": 2.5349015108051254e-07, + "logits/chosen": -2.8953492641448975, + "logits/rejected": -2.9293885231018066, + "logps/chosen": -231.06106567382812, + "logps/rejected": -328.3216552734375, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0628279447555542, + "rewards/margins": 8.15467643737793, + "rewards/rejected": -7.091848850250244, + "step": 3160 + }, + { + "epoch": 1.64, + "learning_rate": 2.5253394530502966e-07, + "logits/chosen": -2.8384034633636475, + "logits/rejected": -2.867201089859009, + "logps/chosen": -332.78564453125, + "logps/rejected": -383.9205322265625, + "loss": 0.0625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7102988958358765, + "rewards/margins": 7.870628356933594, + "rewards/rejected": -7.160330295562744, + "step": 3170 + }, + { + "epoch": 1.64, + "learning_rate": 2.5157773952954677e-07, + "logits/chosen": -2.939607620239258, + "logits/rejected": -2.9498610496520996, + "logps/chosen": -307.7158203125, + "logps/rejected": -341.71466064453125, + "loss": 0.071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9481692314147949, + "rewards/margins": 7.467158317565918, + "rewards/rejected": -6.518989562988281, + "step": 3180 + }, + { + "epoch": 1.65, + "learning_rate": 2.506215337540639e-07, + "logits/chosen": -2.8836703300476074, + "logits/rejected": -2.895805835723877, + "logps/chosen": -208.77883911132812, + "logps/rejected": -361.16754150390625, + "loss": 0.1088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1989336013793945, + "rewards/margins": 7.9919114112854, + "rewards/rejected": -6.792977809906006, + "step": 3190 + }, + { + "epoch": 1.65, + "learning_rate": 2.4966532797858095e-07, + "logits/chosen": -2.8527767658233643, + "logits/rejected": -2.8542492389678955, + "logps/chosen": -248.8079376220703, + "logps/rejected": -321.9994201660156, + "loss": 0.095, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9598841667175293, + "rewards/margins": 7.442416191101074, + "rewards/rejected": -6.482532501220703, + "step": 3200 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.8687751293182373, + "eval_logits/rejected": -2.928819417953491, + "eval_logps/chosen": -250.7627410888672, + "eval_logps/rejected": -319.1705017089844, + "eval_loss": 0.4938603341579437, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -0.3082582652568817, + "eval_rewards/margins": 3.9950480461120605, + "eval_rewards/rejected": -4.303306579589844, + "eval_runtime": 278.8397, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 0.448, + "step": 3200 + }, + { + "epoch": 1.66, + "learning_rate": 2.4870912220309807e-07, + "logits/chosen": -2.84702467918396, + "logits/rejected": -2.915963649749756, + "logps/chosen": -257.31219482421875, + "logps/rejected": -324.8391418457031, + "loss": 0.0766, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4782533645629883, + "rewards/margins": 7.525382995605469, + "rewards/rejected": -6.0471296310424805, + "step": 3210 + }, + { + "epoch": 1.66, + "learning_rate": 2.477529164276152e-07, + "logits/chosen": -2.9667117595672607, + "logits/rejected": -2.967026710510254, + "logps/chosen": -233.61123657226562, + "logps/rejected": -337.34967041015625, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9073492288589478, + "rewards/margins": 6.835887908935547, + "rewards/rejected": -5.9285383224487305, + "step": 3220 + }, + { + "epoch": 1.67, + "learning_rate": 2.4679671065213235e-07, + "logits/chosen": -2.899747133255005, + "logits/rejected": -2.9681437015533447, + "logps/chosen": -247.35348510742188, + "logps/rejected": -374.6236572265625, + "loss": 0.0942, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.231717824935913, + "rewards/margins": 7.991697788238525, + "rewards/rejected": -6.759980201721191, + "step": 3230 + }, + { + "epoch": 1.67, + "learning_rate": 2.4584050487664947e-07, + "logits/chosen": -2.859067440032959, + "logits/rejected": -2.8838469982147217, + "logps/chosen": -246.4993438720703, + "logps/rejected": -319.4035339355469, + "loss": 0.1055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8468250036239624, + "rewards/margins": 8.415315628051758, + "rewards/rejected": -6.568489074707031, + "step": 3240 + }, + { + "epoch": 1.68, + "learning_rate": 2.448842991011666e-07, + "logits/chosen": -2.8728671073913574, + "logits/rejected": -2.8752236366271973, + "logps/chosen": -225.12265014648438, + "logps/rejected": -329.4236755371094, + "loss": 0.1032, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2581980228424072, + "rewards/margins": 7.4553961753845215, + "rewards/rejected": -6.197198390960693, + "step": 3250 + }, + { + "epoch": 1.68, + "learning_rate": 2.439280933256837e-07, + "logits/chosen": -2.853830099105835, + "logits/rejected": -2.8993842601776123, + "logps/chosen": -206.5619659423828, + "logps/rejected": -349.40966796875, + "loss": 0.0579, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0377991199493408, + "rewards/margins": 7.783722877502441, + "rewards/rejected": -6.7459235191345215, + "step": 3260 + }, + { + "epoch": 1.69, + "learning_rate": 2.429718875502008e-07, + "logits/chosen": -2.723466634750366, + "logits/rejected": -2.810455799102783, + "logps/chosen": -234.6430206298828, + "logps/rejected": -392.9654541015625, + "loss": 0.0802, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2275792360305786, + "rewards/margins": 8.184759140014648, + "rewards/rejected": -6.957180023193359, + "step": 3270 + }, + { + "epoch": 1.69, + "learning_rate": 2.420156817747179e-07, + "logits/chosen": -2.819467067718506, + "logits/rejected": -2.7857964038848877, + "logps/chosen": -255.324951171875, + "logps/rejected": -285.57769775390625, + "loss": 0.0542, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5177501440048218, + "rewards/margins": 7.244959831237793, + "rewards/rejected": -5.727210521697998, + "step": 3280 + }, + { + "epoch": 1.7, + "learning_rate": 2.41059475999235e-07, + "logits/chosen": -2.8438751697540283, + "logits/rejected": -2.8182473182678223, + "logps/chosen": -228.69528198242188, + "logps/rejected": -317.17242431640625, + "loss": 0.1373, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.5363596677780151, + "rewards/margins": 6.843097686767578, + "rewards/rejected": -6.306737899780273, + "step": 3290 + }, + { + "epoch": 1.7, + "learning_rate": 2.4010327022375216e-07, + "logits/chosen": -2.859078884124756, + "logits/rejected": -2.923827648162842, + "logps/chosen": -244.914794921875, + "logps/rejected": -357.98052978515625, + "loss": 0.1147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0887374877929688, + "rewards/margins": 7.975939750671387, + "rewards/rejected": -6.887202262878418, + "step": 3300 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -2.848367214202881, + "eval_logits/rejected": -2.91117787361145, + "eval_logps/chosen": -252.27926635742188, + "eval_logps/rejected": -323.21832275390625, + "eval_loss": 0.48966941237449646, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -0.4599113464355469, + "eval_rewards/margins": 4.2481770515441895, + "eval_rewards/rejected": -4.708088397979736, + "eval_runtime": 278.765, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 0.448, + "step": 3300 + }, + { + "epoch": 1.71, + "learning_rate": 2.391470644482693e-07, + "logits/chosen": -2.8616204261779785, + "logits/rejected": -2.943343162536621, + "logps/chosen": -249.031494140625, + "logps/rejected": -359.6810607910156, + "loss": 0.1186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4715783596038818, + "rewards/margins": 8.211606979370117, + "rewards/rejected": -6.740028381347656, + "step": 3310 + }, + { + "epoch": 1.71, + "learning_rate": 2.3819085867278636e-07, + "logits/chosen": -2.801548480987549, + "logits/rejected": -2.8232216835021973, + "logps/chosen": -170.48995971679688, + "logps/rejected": -271.6853942871094, + "loss": 0.0903, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2638075053691864, + "rewards/margins": 6.040502548217773, + "rewards/rejected": -5.7766947746276855, + "step": 3320 + }, + { + "epoch": 1.72, + "learning_rate": 2.3723465289730348e-07, + "logits/chosen": -2.888091564178467, + "logits/rejected": -2.9160046577453613, + "logps/chosen": -257.8720703125, + "logps/rejected": -289.5814208984375, + "loss": 0.0955, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3228559494018555, + "rewards/margins": 6.6165056228637695, + "rewards/rejected": -5.293649673461914, + "step": 3330 + }, + { + "epoch": 1.72, + "learning_rate": 2.362784471218206e-07, + "logits/chosen": -2.8482556343078613, + "logits/rejected": -2.866028308868408, + "logps/chosen": -247.8570098876953, + "logps/rejected": -353.5355224609375, + "loss": 0.0629, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.318343162536621, + "rewards/margins": 7.900382041931152, + "rewards/rejected": -6.582038879394531, + "step": 3340 + }, + { + "epoch": 1.73, + "learning_rate": 2.353222413463377e-07, + "logits/chosen": -2.8011178970336914, + "logits/rejected": -2.826658248901367, + "logps/chosen": -213.5056610107422, + "logps/rejected": -359.37542724609375, + "loss": 0.0429, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0736242532730103, + "rewards/margins": 7.892706394195557, + "rewards/rejected": -6.819081783294678, + "step": 3350 + }, + { + "epoch": 1.73, + "learning_rate": 2.3436603557085483e-07, + "logits/chosen": -2.8530900478363037, + "logits/rejected": -2.8854198455810547, + "logps/chosen": -218.562255859375, + "logps/rejected": -370.8888244628906, + "loss": 0.0755, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5468239784240723, + "rewards/margins": 8.230497360229492, + "rewards/rejected": -6.683673858642578, + "step": 3360 + }, + { + "epoch": 1.74, + "learning_rate": 2.3340982979537197e-07, + "logits/chosen": -2.828904390335083, + "logits/rejected": -2.8650832176208496, + "logps/chosen": -292.4428405761719, + "logps/rejected": -395.83441162109375, + "loss": 0.1209, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.142470359802246, + "rewards/margins": 8.96367073059082, + "rewards/rejected": -6.821200370788574, + "step": 3370 + }, + { + "epoch": 1.74, + "learning_rate": 2.3245362401988909e-07, + "logits/chosen": -2.8313663005828857, + "logits/rejected": -2.8785340785980225, + "logps/chosen": -261.6875, + "logps/rejected": -338.6850280761719, + "loss": 0.1164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.722820520401001, + "rewards/margins": 8.832094192504883, + "rewards/rejected": -7.109274387359619, + "step": 3380 + }, + { + "epoch": 1.75, + "learning_rate": 2.314974182444062e-07, + "logits/chosen": -2.8746933937072754, + "logits/rejected": -2.916006326675415, + "logps/chosen": -223.5010986328125, + "logps/rejected": -337.25439453125, + "loss": 0.0708, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.14647480845451355, + "rewards/margins": 6.7344183921813965, + "rewards/rejected": -6.880892276763916, + "step": 3390 + }, + { + "epoch": 1.76, + "learning_rate": 2.305412124689233e-07, + "logits/chosen": -2.765967845916748, + "logits/rejected": -2.7947843074798584, + "logps/chosen": -268.5664978027344, + "logps/rejected": -348.4090576171875, + "loss": 0.1677, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8971278071403503, + "rewards/margins": 8.047765731811523, + "rewards/rejected": -7.150639533996582, + "step": 3400 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -2.780897855758667, + "eval_logits/rejected": -2.84079647064209, + "eval_logps/chosen": -255.14529418945312, + "eval_logps/rejected": -327.32879638671875, + "eval_loss": 0.49304431676864624, + "eval_rewards/accuracies": 0.8199999928474426, + "eval_rewards/chosen": -0.7465150356292725, + "eval_rewards/margins": 4.372622489929199, + "eval_rewards/rejected": -5.119137763977051, + "eval_runtime": 278.8693, + "eval_samples_per_second": 7.172, + "eval_steps_per_second": 0.448, + "step": 3400 + }, + { + "epoch": 1.76, + "learning_rate": 2.295850066934404e-07, + "logits/chosen": -2.793631076812744, + "logits/rejected": -2.8191399574279785, + "logps/chosen": -238.7302703857422, + "logps/rejected": -339.9305419921875, + "loss": 0.0874, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6573998332023621, + "rewards/margins": 7.640374660491943, + "rewards/rejected": -6.982974052429199, + "step": 3410 + }, + { + "epoch": 1.77, + "learning_rate": 2.2862880091795752e-07, + "logits/chosen": -2.8327536582946777, + "logits/rejected": -2.8614022731781006, + "logps/chosen": -239.38424682617188, + "logps/rejected": -378.095458984375, + "loss": 0.0628, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9374256134033203, + "rewards/margins": 9.278172492980957, + "rewards/rejected": -8.340746879577637, + "step": 3420 + }, + { + "epoch": 1.77, + "learning_rate": 2.2767259514247464e-07, + "logits/chosen": -2.7702488899230957, + "logits/rejected": -2.834359645843506, + "logps/chosen": -219.39254760742188, + "logps/rejected": -384.39666748046875, + "loss": 0.0825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6664879322052002, + "rewards/margins": 8.466286659240723, + "rewards/rejected": -6.799798011779785, + "step": 3430 + }, + { + "epoch": 1.78, + "learning_rate": 2.2671638936699178e-07, + "logits/chosen": -2.7215263843536377, + "logits/rejected": -2.743380069732666, + "logps/chosen": -258.17010498046875, + "logps/rejected": -365.0523986816406, + "loss": 0.0634, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.524726927280426, + "rewards/margins": 7.3098273277282715, + "rewards/rejected": -6.785101413726807, + "step": 3440 + }, + { + "epoch": 1.78, + "learning_rate": 2.257601835915089e-07, + "logits/chosen": -2.795828104019165, + "logits/rejected": -2.8122973442077637, + "logps/chosen": -330.8063049316406, + "logps/rejected": -366.1406555175781, + "loss": 0.0586, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7863120436668396, + "rewards/margins": 7.602275848388672, + "rewards/rejected": -6.815962791442871, + "step": 3450 + }, + { + "epoch": 1.79, + "learning_rate": 2.24803977816026e-07, + "logits/chosen": -2.7401373386383057, + "logits/rejected": -2.792062997817993, + "logps/chosen": -262.8241271972656, + "logps/rejected": -324.1911926269531, + "loss": 0.0767, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5823167562484741, + "rewards/margins": 8.485211372375488, + "rewards/rejected": -6.902894496917725, + "step": 3460 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384777204054313e-07, + "logits/chosen": -2.840707778930664, + "logits/rejected": -2.8512017726898193, + "logps/chosen": -259.43536376953125, + "logps/rejected": -344.71881103515625, + "loss": 0.0943, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8209340572357178, + "rewards/margins": 8.689406394958496, + "rewards/rejected": -6.868472099304199, + "step": 3470 + }, + { + "epoch": 1.8, + "learning_rate": 2.2289156626506022e-07, + "logits/chosen": -2.7405035495758057, + "logits/rejected": -2.7608306407928467, + "logps/chosen": -306.3294982910156, + "logps/rejected": -349.9527893066406, + "loss": 0.0995, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6940513849258423, + "rewards/margins": 8.543526649475098, + "rewards/rejected": -7.849474906921387, + "step": 3480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193536048957733e-07, + "logits/chosen": -2.891343593597412, + "logits/rejected": -2.896864414215088, + "logps/chosen": -275.199462890625, + "logps/rejected": -365.2381591796875, + "loss": 0.1152, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8797968029975891, + "rewards/margins": 7.490570068359375, + "rewards/rejected": -6.610772609710693, + "step": 3490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2097915471409445e-07, + "logits/chosen": -2.8077056407928467, + "logits/rejected": -2.8688576221466064, + "logps/chosen": -241.04013061523438, + "logps/rejected": -348.31292724609375, + "loss": 0.0581, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4186904430389404, + "rewards/margins": 8.326966285705566, + "rewards/rejected": -6.908276557922363, + "step": 3500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.8191068172454834, + "eval_logits/rejected": -2.874876022338867, + "eval_logps/chosen": -250.5966339111328, + "eval_logps/rejected": -321.31298828125, + "eval_loss": 0.48586151003837585, + "eval_rewards/accuracies": 0.8180000185966492, + "eval_rewards/chosen": -0.29164865612983704, + "eval_rewards/margins": 4.225912094116211, + "eval_rewards/rejected": -4.5175604820251465, + "eval_runtime": 278.7288, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 0.448, + "step": 3500 + }, + { + "epoch": 1.81, + "learning_rate": 2.200229489386116e-07, + "logits/chosen": -2.8601200580596924, + "logits/rejected": -2.905097484588623, + "logps/chosen": -241.131103515625, + "logps/rejected": -370.8861999511719, + "loss": 0.0544, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0310213565826416, + "rewards/margins": 8.371079444885254, + "rewards/rejected": -7.340056419372559, + "step": 3510 + }, + { + "epoch": 1.82, + "learning_rate": 2.190667431631287e-07, + "logits/chosen": -2.8398001194000244, + "logits/rejected": -2.8893535137176514, + "logps/chosen": -271.63861083984375, + "logps/rejected": -412.21075439453125, + "loss": 0.0874, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.676041841506958, + "rewards/margins": 9.154836654663086, + "rewards/rejected": -7.478795051574707, + "step": 3520 + }, + { + "epoch": 1.82, + "learning_rate": 2.1811053738764582e-07, + "logits/chosen": -2.7711901664733887, + "logits/rejected": -2.8552980422973633, + "logps/chosen": -193.1044921875, + "logps/rejected": -350.78619384765625, + "loss": 0.0921, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3733131885528564, + "rewards/margins": 8.322312355041504, + "rewards/rejected": -6.948998928070068, + "step": 3530 + }, + { + "epoch": 1.83, + "learning_rate": 2.1715433161216294e-07, + "logits/chosen": -2.8452248573303223, + "logits/rejected": -2.8975632190704346, + "logps/chosen": -254.40078735351562, + "logps/rejected": -356.9154052734375, + "loss": 0.4464, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0389509201049805, + "rewards/margins": 7.992854118347168, + "rewards/rejected": -6.9539031982421875, + "step": 3540 + }, + { + "epoch": 1.83, + "learning_rate": 2.1619812583668005e-07, + "logits/chosen": -2.9288828372955322, + "logits/rejected": -2.958012104034424, + "logps/chosen": -227.0967559814453, + "logps/rejected": -314.05908203125, + "loss": 0.0829, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8639600872993469, + "rewards/margins": 8.197954177856445, + "rewards/rejected": -7.3339948654174805, + "step": 3550 + }, + { + "epoch": 1.84, + "learning_rate": 2.1524192006119714e-07, + "logits/chosen": -2.892519235610962, + "logits/rejected": -2.915253162384033, + "logps/chosen": -230.73831176757812, + "logps/rejected": -333.3849792480469, + "loss": 0.099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6762139201164246, + "rewards/margins": 7.587038516998291, + "rewards/rejected": -6.910824775695801, + "step": 3560 + }, + { + "epoch": 1.84, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -2.9029574394226074, + "logits/rejected": -2.9076485633850098, + "logps/chosen": -245.4383544921875, + "logps/rejected": -291.6465148925781, + "loss": 0.0601, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6226890087127686, + "rewards/margins": 7.25844669342041, + "rewards/rejected": -6.6357574462890625, + "step": 3570 + }, + { + "epoch": 1.85, + "learning_rate": 2.133295085102314e-07, + "logits/chosen": -2.7956185340881348, + "logits/rejected": -2.822728157043457, + "logps/chosen": -259.4379577636719, + "logps/rejected": -321.2564392089844, + "loss": 0.069, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.7765030860900879, + "rewards/margins": 7.667203426361084, + "rewards/rejected": -6.890700340270996, + "step": 3580 + }, + { + "epoch": 1.85, + "learning_rate": 2.1237330273474851e-07, + "logits/chosen": -2.8928589820861816, + "logits/rejected": -2.904470920562744, + "logps/chosen": -287.71087646484375, + "logps/rejected": -337.8426818847656, + "loss": 0.0637, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0680280923843384, + "rewards/margins": 7.8063483238220215, + "rewards/rejected": -6.738319396972656, + "step": 3590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1141709695926563e-07, + "logits/chosen": -2.8569111824035645, + "logits/rejected": -2.9227359294891357, + "logps/chosen": -224.90072631835938, + "logps/rejected": -317.1742858886719, + "loss": 0.053, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.06578528881073, + "rewards/margins": 8.363664627075195, + "rewards/rejected": -7.297879695892334, + "step": 3600 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.830040216445923, + "eval_logits/rejected": -2.888530731201172, + "eval_logps/chosen": -253.77218627929688, + "eval_logps/rejected": -326.65185546875, + "eval_loss": 0.4978037178516388, + "eval_rewards/accuracies": 0.8220000267028809, + "eval_rewards/chosen": -0.6092034578323364, + "eval_rewards/margins": 4.442237377166748, + "eval_rewards/rejected": -5.051440238952637, + "eval_runtime": 278.5418, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 3600 + }, + { + "epoch": 1.86, + "learning_rate": 2.1046089118378275e-07, + "logits/chosen": -2.866051197052002, + "logits/rejected": -2.8638253211975098, + "logps/chosen": -257.161865234375, + "logps/rejected": -354.1871643066406, + "loss": 0.1621, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5690984725952148, + "rewards/margins": 8.197249412536621, + "rewards/rejected": -7.628150939941406, + "step": 3610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0950468540829986e-07, + "logits/chosen": -2.8620736598968506, + "logits/rejected": -2.8946380615234375, + "logps/chosen": -248.4556121826172, + "logps/rejected": -326.19573974609375, + "loss": 0.109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.479146957397461, + "rewards/margins": 7.966286659240723, + "rewards/rejected": -6.4871392250061035, + "step": 3620 + }, + { + "epoch": 1.87, + "learning_rate": 2.0854847963281698e-07, + "logits/chosen": -2.8275606632232666, + "logits/rejected": -2.8644332885742188, + "logps/chosen": -289.37261962890625, + "logps/rejected": -347.7831726074219, + "loss": 0.1032, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3235790729522705, + "rewards/margins": 7.77987003326416, + "rewards/rejected": -6.456290245056152, + "step": 3630 + }, + { + "epoch": 1.88, + "learning_rate": 2.0759227385733407e-07, + "logits/chosen": -2.7326953411102295, + "logits/rejected": -2.7684574127197266, + "logps/chosen": -280.19427490234375, + "logps/rejected": -348.9992370605469, + "loss": 0.0828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8279417157173157, + "rewards/margins": 7.6720428466796875, + "rewards/rejected": -6.844099998474121, + "step": 3640 + }, + { + "epoch": 1.88, + "learning_rate": 2.066360680818512e-07, + "logits/chosen": -2.8425121307373047, + "logits/rejected": -2.85652494430542, + "logps/chosen": -328.8985595703125, + "logps/rejected": -353.4617004394531, + "loss": 0.1833, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.654839277267456, + "rewards/margins": 8.085906028747559, + "rewards/rejected": -6.43106746673584, + "step": 3650 + }, + { + "epoch": 1.89, + "learning_rate": 2.0567986230636832e-07, + "logits/chosen": -2.82452392578125, + "logits/rejected": -2.8541016578674316, + "logps/chosen": -229.6313018798828, + "logps/rejected": -334.2443542480469, + "loss": 0.0618, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3441283702850342, + "rewards/margins": 8.709617614746094, + "rewards/rejected": -7.365488529205322, + "step": 3660 + }, + { + "epoch": 1.89, + "learning_rate": 2.0472365653088544e-07, + "logits/chosen": -2.8438940048217773, + "logits/rejected": -2.852888345718384, + "logps/chosen": -275.33172607421875, + "logps/rejected": -345.549560546875, + "loss": 0.1453, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8664258718490601, + "rewards/margins": 7.481070041656494, + "rewards/rejected": -6.6146440505981445, + "step": 3670 + }, + { + "epoch": 1.9, + "learning_rate": 2.0376745075540256e-07, + "logits/chosen": -2.86684513092041, + "logits/rejected": -2.9170501232147217, + "logps/chosen": -347.53875732421875, + "logps/rejected": -356.6395568847656, + "loss": 0.085, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8831812739372253, + "rewards/margins": 8.366857528686523, + "rewards/rejected": -7.483675956726074, + "step": 3680 + }, + { + "epoch": 1.91, + "learning_rate": 2.0281124497991967e-07, + "logits/chosen": -2.8049659729003906, + "logits/rejected": -2.8793957233428955, + "logps/chosen": -242.1417999267578, + "logps/rejected": -375.3875427246094, + "loss": 0.0978, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4780528545379639, + "rewards/margins": 8.171385765075684, + "rewards/rejected": -6.693333625793457, + "step": 3690 + }, + { + "epoch": 1.91, + "learning_rate": 2.018550392044368e-07, + "logits/chosen": -2.8545122146606445, + "logits/rejected": -2.8557956218719482, + "logps/chosen": -269.9671936035156, + "logps/rejected": -395.208251953125, + "loss": 0.0603, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7889567613601685, + "rewards/margins": 8.470499038696289, + "rewards/rejected": -7.681540489196777, + "step": 3700 + }, + { + "epoch": 1.91, + "eval_logits/chosen": -2.8074629306793213, + "eval_logits/rejected": -2.8709890842437744, + "eval_logps/chosen": -255.2186737060547, + "eval_logps/rejected": -326.8601989746094, + "eval_loss": 0.48304229974746704, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": -0.7538514137268066, + "eval_rewards/margins": 4.318424224853516, + "eval_rewards/rejected": -5.072276592254639, + "eval_runtime": 278.8882, + "eval_samples_per_second": 7.171, + "eval_steps_per_second": 0.448, + "step": 3700 + }, + { + "epoch": 1.92, + "learning_rate": 2.0089883342895388e-07, + "logits/chosen": -2.8379898071289062, + "logits/rejected": -2.8898370265960693, + "logps/chosen": -261.80242919921875, + "logps/rejected": -306.7249755859375, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4923166036605835, + "rewards/margins": 7.94171667098999, + "rewards/rejected": -6.449400424957275, + "step": 3710 + }, + { + "epoch": 1.92, + "learning_rate": 1.9994262765347102e-07, + "logits/chosen": -2.8545596599578857, + "logits/rejected": -2.857635974884033, + "logps/chosen": -302.4580993652344, + "logps/rejected": -325.986083984375, + "loss": 0.0658, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9443203210830688, + "rewards/margins": 8.127703666687012, + "rewards/rejected": -7.183383941650391, + "step": 3720 + }, + { + "epoch": 1.93, + "learning_rate": 1.9898642187798813e-07, + "logits/chosen": -2.745156764984131, + "logits/rejected": -2.785728931427002, + "logps/chosen": -261.17877197265625, + "logps/rejected": -370.0102844238281, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6731252670288086, + "rewards/margins": 9.124866485595703, + "rewards/rejected": -7.4517412185668945, + "step": 3730 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803021610250525e-07, + "logits/chosen": -2.8896374702453613, + "logits/rejected": -2.9223759174346924, + "logps/chosen": -214.2520294189453, + "logps/rejected": -315.9022216796875, + "loss": 0.1006, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5602417588233948, + "rewards/margins": 6.8746466636657715, + "rewards/rejected": -6.314405918121338, + "step": 3740 + }, + { + "epoch": 1.94, + "learning_rate": 1.9707401032702237e-07, + "logits/chosen": -2.772069215774536, + "logits/rejected": -2.7865991592407227, + "logps/chosen": -257.6865234375, + "logps/rejected": -336.3182373046875, + "loss": 0.0962, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9006959795951843, + "rewards/margins": 7.774443626403809, + "rewards/rejected": -6.8737473487854, + "step": 3750 + }, + { + "epoch": 1.94, + "learning_rate": 1.9611780455153948e-07, + "logits/chosen": -2.8456928730010986, + "logits/rejected": -2.934762477874756, + "logps/chosen": -185.00128173828125, + "logps/rejected": -335.33245849609375, + "loss": 0.0523, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1279864311218262, + "rewards/margins": 7.620774745941162, + "rewards/rejected": -6.4927873611450195, + "step": 3760 + }, + { + "epoch": 1.95, + "learning_rate": 1.951615987760566e-07, + "logits/chosen": -2.7873167991638184, + "logits/rejected": -2.8468079566955566, + "logps/chosen": -234.56436157226562, + "logps/rejected": -342.3611145019531, + "loss": 0.0705, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4214778542518616, + "rewards/margins": 7.9835524559021, + "rewards/rejected": -7.562074184417725, + "step": 3770 + }, + { + "epoch": 1.95, + "learning_rate": 1.942053930005737e-07, + "logits/chosen": -2.926084041595459, + "logits/rejected": -2.9593288898468018, + "logps/chosen": -222.12442016601562, + "logps/rejected": -343.7054748535156, + "loss": 0.0495, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9796302318572998, + "rewards/margins": 9.76245403289795, + "rewards/rejected": -7.7828240394592285, + "step": 3780 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324918722509086e-07, + "logits/chosen": -2.8268990516662598, + "logits/rejected": -2.892279624938965, + "logps/chosen": -268.910888671875, + "logps/rejected": -384.81329345703125, + "loss": 0.1037, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7527690529823303, + "rewards/margins": 8.045644760131836, + "rewards/rejected": -7.29287576675415, + "step": 3790 + }, + { + "epoch": 1.96, + "learning_rate": 1.9229298144960794e-07, + "logits/chosen": -2.876244068145752, + "logits/rejected": -2.9497416019439697, + "logps/chosen": -252.00454711914062, + "logps/rejected": -334.44183349609375, + "loss": 0.1269, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8252018094062805, + "rewards/margins": 7.863356590270996, + "rewards/rejected": -7.038155555725098, + "step": 3800 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.8554041385650635, + "eval_logits/rejected": -2.912135601043701, + "eval_logps/chosen": -252.01141357421875, + "eval_logps/rejected": -321.33148193359375, + "eval_loss": 0.4793297350406647, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -0.43312713503837585, + "eval_rewards/margins": 4.086278438568115, + "eval_rewards/rejected": -4.519405364990234, + "eval_runtime": 278.7658, + "eval_samples_per_second": 7.174, + "eval_steps_per_second": 0.448, + "step": 3800 + }, + { + "epoch": 1.97, + "learning_rate": 1.9133677567412506e-07, + "logits/chosen": -2.876587152481079, + "logits/rejected": -2.9200339317321777, + "logps/chosen": -290.6847839355469, + "logps/rejected": -328.954833984375, + "loss": 0.0773, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8511034250259399, + "rewards/margins": 7.0640459060668945, + "rewards/rejected": -6.212942123413086, + "step": 3810 + }, + { + "epoch": 1.97, + "learning_rate": 1.9038056989864218e-07, + "logits/chosen": -2.860901355743408, + "logits/rejected": -2.9103498458862305, + "logps/chosen": -194.5924072265625, + "logps/rejected": -308.28668212890625, + "loss": 0.0948, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2943288087844849, + "rewards/margins": 6.222657203674316, + "rewards/rejected": -4.928328514099121, + "step": 3820 + }, + { + "epoch": 1.98, + "learning_rate": 1.894243641231593e-07, + "logits/chosen": -2.7092089653015137, + "logits/rejected": -2.7530195713043213, + "logps/chosen": -235.4855194091797, + "logps/rejected": -291.153564453125, + "loss": 0.1392, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6584703922271729, + "rewards/margins": 6.632231712341309, + "rewards/rejected": -5.973761081695557, + "step": 3830 + }, + { + "epoch": 1.98, + "learning_rate": 1.884681583476764e-07, + "logits/chosen": -2.8745875358581543, + "logits/rejected": -2.9260880947113037, + "logps/chosen": -281.93890380859375, + "logps/rejected": -325.87213134765625, + "loss": 0.0558, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5928620100021362, + "rewards/margins": 6.597723484039307, + "rewards/rejected": -6.004860877990723, + "step": 3840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8751195257219352e-07, + "logits/chosen": -2.8994739055633545, + "logits/rejected": -2.9488778114318848, + "logps/chosen": -188.28561401367188, + "logps/rejected": -319.95281982421875, + "loss": 0.0772, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0726709365844727, + "rewards/margins": 6.868597984313965, + "rewards/rejected": -5.79592752456665, + "step": 3850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8655574679671067e-07, + "logits/chosen": -2.879817485809326, + "logits/rejected": -2.9309778213500977, + "logps/chosen": -252.77963256835938, + "logps/rejected": -353.0110168457031, + "loss": 0.1026, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7972162961959839, + "rewards/margins": 7.9078192710876465, + "rewards/rejected": -7.110602378845215, + "step": 3860 + }, + { + "epoch": 2.0, + "learning_rate": 1.8559954102122778e-07, + "logits/chosen": -2.9028608798980713, + "logits/rejected": -2.8901915550231934, + "logps/chosen": -253.7400665283203, + "logps/rejected": -343.6603698730469, + "loss": 0.0992, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.4057618379592896, + "rewards/margins": 7.818563938140869, + "rewards/rejected": -6.412802696228027, + "step": 3870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8464333524574487e-07, + "logits/chosen": -2.8687682151794434, + "logits/rejected": -2.9383254051208496, + "logps/chosen": -229.3580322265625, + "logps/rejected": -317.1125183105469, + "loss": 0.0279, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3208388090133667, + "rewards/margins": 8.077336311340332, + "rewards/rejected": -6.756496429443359, + "step": 3880 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368712947026199e-07, + "logits/chosen": -2.865574359893799, + "logits/rejected": -2.918137550354004, + "logps/chosen": -284.77618408203125, + "logps/rejected": -341.1562805175781, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1757142543792725, + "rewards/margins": 8.95848274230957, + "rewards/rejected": -6.782768249511719, + "step": 3890 + }, + { + "epoch": 2.01, + "learning_rate": 1.827309236947791e-07, + "logits/chosen": -2.792001247406006, + "logits/rejected": -2.859900712966919, + "logps/chosen": -250.72314453125, + "logps/rejected": -352.9478454589844, + "loss": 0.0191, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8500796556472778, + "rewards/margins": 9.03044605255127, + "rewards/rejected": -8.180366516113281, + "step": 3900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -2.824638843536377, + "eval_logits/rejected": -2.885737657546997, + "eval_logps/chosen": -252.56590270996094, + "eval_logps/rejected": -326.02313232421875, + "eval_loss": 0.48027554154396057, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -0.48857322335243225, + "eval_rewards/margins": 4.499995231628418, + "eval_rewards/rejected": -4.988568305969238, + "eval_runtime": 278.5534, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 3900 + }, + { + "epoch": 2.02, + "learning_rate": 1.8177471791929622e-07, + "logits/chosen": -2.786916971206665, + "logits/rejected": -2.8448758125305176, + "logps/chosen": -242.95535278320312, + "logps/rejected": -407.13848876953125, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6383663415908813, + "rewards/margins": 9.484217643737793, + "rewards/rejected": -7.845850944519043, + "step": 3910 + }, + { + "epoch": 2.02, + "learning_rate": 1.8081851214381333e-07, + "logits/chosen": -2.750891923904419, + "logits/rejected": -2.7767796516418457, + "logps/chosen": -272.3475646972656, + "logps/rejected": -429.21502685546875, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9751347303390503, + "rewards/margins": 10.68256950378418, + "rewards/rejected": -9.70743465423584, + "step": 3920 + }, + { + "epoch": 2.03, + "learning_rate": 1.7986230636833047e-07, + "logits/chosen": -2.8120033740997314, + "logits/rejected": -2.8324341773986816, + "logps/chosen": -163.6907196044922, + "logps/rejected": -321.2008972167969, + "loss": 0.0197, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0225517749786377, + "rewards/margins": 8.076213836669922, + "rewards/rejected": -7.0536627769470215, + "step": 3930 + }, + { + "epoch": 2.03, + "learning_rate": 1.789061005928476e-07, + "logits/chosen": -2.8280625343322754, + "logits/rejected": -2.853699207305908, + "logps/chosen": -227.91928100585938, + "logps/rejected": -318.5960998535156, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3885648250579834, + "rewards/margins": 9.088371276855469, + "rewards/rejected": -7.699806213378906, + "step": 3940 + }, + { + "epoch": 2.04, + "learning_rate": 1.7794989481736468e-07, + "logits/chosen": -2.839874267578125, + "logits/rejected": -2.8559255599975586, + "logps/chosen": -265.1712951660156, + "logps/rejected": -363.5159912109375, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3292443752288818, + "rewards/margins": 9.883804321289062, + "rewards/rejected": -8.554559707641602, + "step": 3950 + }, + { + "epoch": 2.04, + "learning_rate": 1.769936890418818e-07, + "logits/chosen": -2.7737081050872803, + "logits/rejected": -2.808875560760498, + "logps/chosen": -267.7574768066406, + "logps/rejected": -366.5141906738281, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.896043062210083, + "rewards/margins": 9.658696174621582, + "rewards/rejected": -7.762652397155762, + "step": 3960 + }, + { + "epoch": 2.05, + "learning_rate": 1.760374832663989e-07, + "logits/chosen": -2.759280204772949, + "logits/rejected": -2.753592014312744, + "logps/chosen": -259.1358947753906, + "logps/rejected": -381.5042724609375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4964958727359772, + "rewards/margins": 8.723490715026855, + "rewards/rejected": -8.226995468139648, + "step": 3970 + }, + { + "epoch": 2.05, + "learning_rate": 1.7508127749091603e-07, + "logits/chosen": -2.800121545791626, + "logits/rejected": -2.839855670928955, + "logps/chosen": -279.0640869140625, + "logps/rejected": -373.6163635253906, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0177298784255981, + "rewards/margins": 9.800603866577148, + "rewards/rejected": -8.782873153686523, + "step": 3980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7412507171543314e-07, + "logits/chosen": -2.786940813064575, + "logits/rejected": -2.836458683013916, + "logps/chosen": -194.54067993164062, + "logps/rejected": -314.28802490234375, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3166208267211914, + "rewards/margins": 9.230585098266602, + "rewards/rejected": -8.913965225219727, + "step": 3990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7316886593995028e-07, + "logits/chosen": -2.807687282562256, + "logits/rejected": -2.8598742485046387, + "logps/chosen": -218.19580078125, + "logps/rejected": -344.6866149902344, + "loss": 0.0168, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6459993720054626, + "rewards/margins": 8.975092887878418, + "rewards/rejected": -8.329092025756836, + "step": 4000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.777517795562744, + "eval_logits/rejected": -2.841887950897217, + "eval_logps/chosen": -257.9146423339844, + "eval_logps/rejected": -337.38818359375, + "eval_loss": 0.5259261727333069, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": -1.0234500169754028, + "eval_rewards/margins": 5.101625442504883, + "eval_rewards/rejected": -6.1250762939453125, + "eval_runtime": 278.3659, + "eval_samples_per_second": 7.185, + "eval_steps_per_second": 0.449, + "step": 4000 + }, + { + "epoch": 2.07, + "learning_rate": 1.722126601644674e-07, + "logits/chosen": -2.800914764404297, + "logits/rejected": -2.830235719680786, + "logps/chosen": -248.7406768798828, + "logps/rejected": -330.05267333984375, + "loss": 0.0173, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9889074563980103, + "rewards/margins": 9.021321296691895, + "rewards/rejected": -8.032414436340332, + "step": 4010 + }, + { + "epoch": 2.08, + "learning_rate": 1.7125645438898452e-07, + "logits/chosen": -2.737515687942505, + "logits/rejected": -2.819586753845215, + "logps/chosen": -243.9829559326172, + "logps/rejected": -350.6963195800781, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6946113705635071, + "rewards/margins": 9.870268821716309, + "rewards/rejected": -9.17565631866455, + "step": 4020 + }, + { + "epoch": 2.08, + "learning_rate": 1.703002486135016e-07, + "logits/chosen": -2.7942397594451904, + "logits/rejected": -2.8367092609405518, + "logps/chosen": -281.71917724609375, + "logps/rejected": -360.45782470703125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3287776708602905, + "rewards/margins": 10.280738830566406, + "rewards/rejected": -8.951960563659668, + "step": 4030 + }, + { + "epoch": 2.09, + "learning_rate": 1.6934404283801872e-07, + "logits/chosen": -2.7896275520324707, + "logits/rejected": -2.77767014503479, + "logps/chosen": -238.7395477294922, + "logps/rejected": -421.7259826660156, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.795906662940979, + "rewards/margins": 10.489312171936035, + "rewards/rejected": -9.693406105041504, + "step": 4040 + }, + { + "epoch": 2.09, + "learning_rate": 1.6838783706253584e-07, + "logits/chosen": -2.7938098907470703, + "logits/rejected": -2.836862564086914, + "logps/chosen": -192.1158447265625, + "logps/rejected": -331.1219787597656, + "loss": 0.0345, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6757541298866272, + "rewards/margins": 9.389961242675781, + "rewards/rejected": -8.714208602905273, + "step": 4050 + }, + { + "epoch": 2.1, + "learning_rate": 1.6743163128705295e-07, + "logits/chosen": -2.7857515811920166, + "logits/rejected": -2.8047680854797363, + "logps/chosen": -237.7971954345703, + "logps/rejected": -362.19476318359375, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.536388099193573, + "rewards/margins": 10.030662536621094, + "rewards/rejected": -9.494275093078613, + "step": 4060 + }, + { + "epoch": 2.1, + "learning_rate": 1.664754255115701e-07, + "logits/chosen": -2.777409076690674, + "logits/rejected": -2.831268548965454, + "logps/chosen": -246.1458740234375, + "logps/rejected": -395.8076171875, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3308227062225342, + "rewards/margins": 10.749163627624512, + "rewards/rejected": -10.418339729309082, + "step": 4070 + }, + { + "epoch": 2.11, + "learning_rate": 1.655192197360872e-07, + "logits/chosen": -2.858748197555542, + "logits/rejected": -2.882028102874756, + "logps/chosen": -250.802001953125, + "logps/rejected": -395.7652587890625, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7184736132621765, + "rewards/margins": 10.270122528076172, + "rewards/rejected": -9.55164909362793, + "step": 4080 + }, + { + "epoch": 2.11, + "learning_rate": 1.6456301396060433e-07, + "logits/chosen": -2.810091018676758, + "logits/rejected": -2.8521618843078613, + "logps/chosen": -316.01104736328125, + "logps/rejected": -424.2666931152344, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11161376535892487, + "rewards/margins": 10.254980087280273, + "rewards/rejected": -10.143366813659668, + "step": 4090 + }, + { + "epoch": 2.12, + "learning_rate": 1.6360680818512144e-07, + "logits/chosen": -2.7612602710723877, + "logits/rejected": -2.825305461883545, + "logps/chosen": -223.1158905029297, + "logps/rejected": -373.63433837890625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.576636791229248, + "rewards/margins": 9.575021743774414, + "rewards/rejected": -8.998383522033691, + "step": 4100 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.7581655979156494, + "eval_logits/rejected": -2.824922800064087, + "eval_logps/chosen": -263.41705322265625, + "eval_logps/rejected": -346.3928527832031, + "eval_loss": 0.5714476108551025, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.573691725730896, + "eval_rewards/margins": 5.451850891113281, + "eval_rewards/rejected": -7.025542259216309, + "eval_runtime": 278.5529, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 4100 + }, + { + "epoch": 2.12, + "learning_rate": 1.6265060240963853e-07, + "logits/chosen": -2.7999181747436523, + "logits/rejected": -2.851715326309204, + "logps/chosen": -264.3846435546875, + "logps/rejected": -366.8013610839844, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5434740781784058, + "rewards/margins": 9.045372009277344, + "rewards/rejected": -8.501897811889648, + "step": 4110 + }, + { + "epoch": 2.13, + "learning_rate": 1.6169439663415565e-07, + "logits/chosen": -2.8644258975982666, + "logits/rejected": -2.865626811981201, + "logps/chosen": -227.87234497070312, + "logps/rejected": -376.9765625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4827271103858948, + "rewards/margins": 10.994937896728516, + "rewards/rejected": -10.512212753295898, + "step": 4120 + }, + { + "epoch": 2.13, + "learning_rate": 1.6073819085867276e-07, + "logits/chosen": -2.8559463024139404, + "logits/rejected": -2.920194387435913, + "logps/chosen": -225.81591796875, + "logps/rejected": -457.36187744140625, + "loss": 0.0152, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.435596227645874, + "rewards/margins": 12.244255065917969, + "rewards/rejected": -10.808659553527832, + "step": 4130 + }, + { + "epoch": 2.14, + "learning_rate": 1.597819850831899e-07, + "logits/chosen": -2.7609105110168457, + "logits/rejected": -2.801273822784424, + "logps/chosen": -204.47128295898438, + "logps/rejected": -378.7325134277344, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7425202131271362, + "rewards/margins": 10.559002876281738, + "rewards/rejected": -9.816482543945312, + "step": 4140 + }, + { + "epoch": 2.14, + "learning_rate": 1.5882577930770702e-07, + "logits/chosen": -2.828613758087158, + "logits/rejected": -2.861415386199951, + "logps/chosen": -261.1982116699219, + "logps/rejected": -361.2552490234375, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0972164124250412, + "rewards/margins": 9.664294242858887, + "rewards/rejected": -9.76151180267334, + "step": 4150 + }, + { + "epoch": 2.15, + "learning_rate": 1.5786957353222414e-07, + "logits/chosen": -2.8005096912384033, + "logits/rejected": -2.827037811279297, + "logps/chosen": -300.31878662109375, + "logps/rejected": -373.00482177734375, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3028424978256226, + "rewards/margins": 10.192333221435547, + "rewards/rejected": -8.889491081237793, + "step": 4160 + }, + { + "epoch": 2.15, + "learning_rate": 1.5691336775674125e-07, + "logits/chosen": -2.7364072799682617, + "logits/rejected": -2.775285482406616, + "logps/chosen": -246.3945770263672, + "logps/rejected": -368.32135009765625, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5547359585762024, + "rewards/margins": 10.201997756958008, + "rewards/rejected": -9.647260665893555, + "step": 4170 + }, + { + "epoch": 2.16, + "learning_rate": 1.5595716198125837e-07, + "logits/chosen": -2.736896276473999, + "logits/rejected": -2.791792631149292, + "logps/chosen": -255.3802947998047, + "logps/rejected": -392.916748046875, + "loss": 0.0175, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.13744010031223297, + "rewards/margins": 10.613411903381348, + "rewards/rejected": -10.475973129272461, + "step": 4180 + }, + { + "epoch": 2.16, + "learning_rate": 1.5500095620577546e-07, + "logits/chosen": -2.822958469390869, + "logits/rejected": -2.869006633758545, + "logps/chosen": -235.73965454101562, + "logps/rejected": -324.2436218261719, + "loss": 0.0186, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5705742239952087, + "rewards/margins": 10.561267852783203, + "rewards/rejected": -9.990694046020508, + "step": 4190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5404475043029257e-07, + "logits/chosen": -2.7899680137634277, + "logits/rejected": -2.828990936279297, + "logps/chosen": -236.803466796875, + "logps/rejected": -358.914306640625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1756969392299652, + "rewards/margins": 10.331613540649414, + "rewards/rejected": -10.50731086730957, + "step": 4200 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.740879535675049, + "eval_logits/rejected": -2.8101789951324463, + "eval_logps/chosen": -265.96771240234375, + "eval_logps/rejected": -348.9774475097656, + "eval_loss": 0.5547088384628296, + "eval_rewards/accuracies": 0.8019999861717224, + "eval_rewards/chosen": -1.8287551403045654, + "eval_rewards/margins": 5.455246448516846, + "eval_rewards/rejected": -7.284001350402832, + "eval_runtime": 279.0068, + "eval_samples_per_second": 7.168, + "eval_steps_per_second": 0.448, + "step": 4200 + }, + { + "epoch": 2.17, + "learning_rate": 1.5308854465480971e-07, + "logits/chosen": -2.7795817852020264, + "logits/rejected": -2.8416852951049805, + "logps/chosen": -246.7899627685547, + "logps/rejected": -322.27886962890625, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.713182806968689, + "rewards/margins": 9.562291145324707, + "rewards/rejected": -8.849108695983887, + "step": 4210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5213233887932683e-07, + "logits/chosen": -2.7340545654296875, + "logits/rejected": -2.7741708755493164, + "logps/chosen": -282.47357177734375, + "logps/rejected": -369.46942138671875, + "loss": 0.0072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5383526682853699, + "rewards/margins": 10.111784934997559, + "rewards/rejected": -9.573432922363281, + "step": 4220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5117613310384395e-07, + "logits/chosen": -2.7639198303222656, + "logits/rejected": -2.794893741607666, + "logps/chosen": -273.2432556152344, + "logps/rejected": -379.93414306640625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12266747653484344, + "rewards/margins": 11.038806915283203, + "rewards/rejected": -11.161474227905273, + "step": 4230 + }, + { + "epoch": 2.19, + "learning_rate": 1.5021992732836106e-07, + "logits/chosen": -2.776571750640869, + "logits/rejected": -2.8270602226257324, + "logps/chosen": -253.249755859375, + "logps/rejected": -416.44195556640625, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12639644742012024, + "rewards/margins": 10.179259300231934, + "rewards/rejected": -10.305657386779785, + "step": 4240 + }, + { + "epoch": 2.19, + "learning_rate": 1.4926372155287818e-07, + "logits/chosen": -2.804246187210083, + "logits/rejected": -2.8505008220672607, + "logps/chosen": -224.5175018310547, + "logps/rejected": -390.603515625, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.19647911190986633, + "rewards/margins": 11.195769309997559, + "rewards/rejected": -10.999292373657227, + "step": 4250 + }, + { + "epoch": 2.2, + "learning_rate": 1.483075157773953e-07, + "logits/chosen": -2.811629056930542, + "logits/rejected": -2.8690712451934814, + "logps/chosen": -249.8781280517578, + "logps/rejected": -350.0668640136719, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5592334866523743, + "rewards/margins": 10.175796508789062, + "rewards/rejected": -9.616562843322754, + "step": 4260 + }, + { + "epoch": 2.2, + "learning_rate": 1.4735131000191238e-07, + "logits/chosen": -2.768264055252075, + "logits/rejected": -2.839480400085449, + "logps/chosen": -203.34783935546875, + "logps/rejected": -307.28948974609375, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11550579220056534, + "rewards/margins": 10.141157150268555, + "rewards/rejected": -10.025650978088379, + "step": 4270 + }, + { + "epoch": 2.21, + "learning_rate": 1.4639510422642952e-07, + "logits/chosen": -2.7731645107269287, + "logits/rejected": -2.837803602218628, + "logps/chosen": -183.10386657714844, + "logps/rejected": -331.3976745605469, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4855395257472992, + "rewards/margins": 9.625699996948242, + "rewards/rejected": -9.140159606933594, + "step": 4280 + }, + { + "epoch": 2.21, + "learning_rate": 1.4543889845094664e-07, + "logits/chosen": -2.802046537399292, + "logits/rejected": -2.8697285652160645, + "logps/chosen": -333.67962646484375, + "logps/rejected": -438.260009765625, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7267153859138489, + "rewards/margins": 11.459893226623535, + "rewards/rejected": -10.733177185058594, + "step": 4290 + }, + { + "epoch": 2.22, + "learning_rate": 1.4448269267546376e-07, + "logits/chosen": -2.827871084213257, + "logits/rejected": -2.8422415256500244, + "logps/chosen": -322.83551025390625, + "logps/rejected": -417.2867736816406, + "loss": 0.0482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1672213077545166, + "rewards/margins": 12.557757377624512, + "rewards/rejected": -10.390536308288574, + "step": 4300 + }, + { + "epoch": 2.22, + "eval_logits/chosen": -2.787360429763794, + "eval_logits/rejected": -2.851278305053711, + "eval_logps/chosen": -259.26263427734375, + "eval_logps/rejected": -340.8786315917969, + "eval_loss": 0.5436837077140808, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.1582494974136353, + "eval_rewards/margins": 5.31587028503418, + "eval_rewards/rejected": -6.474120140075684, + "eval_runtime": 278.6515, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.449, + "step": 4300 + }, + { + "epoch": 2.23, + "learning_rate": 1.4352648689998087e-07, + "logits/chosen": -2.843700647354126, + "logits/rejected": -2.867222309112549, + "logps/chosen": -245.06906127929688, + "logps/rejected": -332.5545959472656, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0607502460479736, + "rewards/margins": 10.345534324645996, + "rewards/rejected": -9.284785270690918, + "step": 4310 + }, + { + "epoch": 2.23, + "learning_rate": 1.42570281124498e-07, + "logits/chosen": -2.7883362770080566, + "logits/rejected": -2.8541436195373535, + "logps/chosen": -284.0835876464844, + "logps/rejected": -381.5480041503906, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4265800714492798, + "rewards/margins": 10.460817337036133, + "rewards/rejected": -9.0342378616333, + "step": 4320 + }, + { + "epoch": 2.24, + "learning_rate": 1.416140753490151e-07, + "logits/chosen": -2.7982468605041504, + "logits/rejected": -2.883110523223877, + "logps/chosen": -281.6730041503906, + "logps/rejected": -414.79962158203125, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39604753255844116, + "rewards/margins": 11.227680206298828, + "rewards/rejected": -10.831633567810059, + "step": 4330 + }, + { + "epoch": 2.24, + "learning_rate": 1.4065786957353222e-07, + "logits/chosen": -2.815957546234131, + "logits/rejected": -2.8724205493927, + "logps/chosen": -259.5505065917969, + "logps/rejected": -376.6618347167969, + "loss": 0.0147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3497107028961182, + "rewards/margins": 10.574721336364746, + "rewards/rejected": -9.225010871887207, + "step": 4340 + }, + { + "epoch": 2.25, + "learning_rate": 1.3970166379804933e-07, + "logits/chosen": -2.804654598236084, + "logits/rejected": -2.8554275035858154, + "logps/chosen": -284.3072509765625, + "logps/rejected": -373.55084228515625, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5603520274162292, + "rewards/margins": 10.00316333770752, + "rewards/rejected": -9.442811012268066, + "step": 4350 + }, + { + "epoch": 2.25, + "learning_rate": 1.3874545802256645e-07, + "logits/chosen": -2.8638744354248047, + "logits/rejected": -2.8915674686431885, + "logps/chosen": -279.210205078125, + "logps/rejected": -344.73651123046875, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7013729810714722, + "rewards/margins": 10.824358940124512, + "rewards/rejected": -10.12298583984375, + "step": 4360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3778925224708357e-07, + "logits/chosen": -2.74699330329895, + "logits/rejected": -2.7918925285339355, + "logps/chosen": -247.57296752929688, + "logps/rejected": -379.8360900878906, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.507546067237854, + "rewards/margins": 10.973848342895508, + "rewards/rejected": -10.466302871704102, + "step": 4370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3683304647160068e-07, + "logits/chosen": -2.755742311477661, + "logits/rejected": -2.8171803951263428, + "logps/chosen": -208.4476776123047, + "logps/rejected": -373.80621337890625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6287239193916321, + "rewards/margins": 10.667415618896484, + "rewards/rejected": -10.038691520690918, + "step": 4380 + }, + { + "epoch": 2.27, + "learning_rate": 1.358768406961178e-07, + "logits/chosen": -2.808208703994751, + "logits/rejected": -2.864919662475586, + "logps/chosen": -214.84634399414062, + "logps/rejected": -361.06890869140625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06784908473491669, + "rewards/margins": 10.550009727478027, + "rewards/rejected": -10.482160568237305, + "step": 4390 + }, + { + "epoch": 2.27, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": -2.832120180130005, + "logits/rejected": -2.8752362728118896, + "logps/chosen": -162.49362182617188, + "logps/rejected": -339.9632568359375, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02236497402191162, + "rewards/margins": 11.080262184143066, + "rewards/rejected": -11.057897567749023, + "step": 4400 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.7835676670074463, + "eval_logits/rejected": -2.847372531890869, + "eval_logps/chosen": -263.640869140625, + "eval_logps/rejected": -347.76019287109375, + "eval_loss": 0.5489197969436646, + "eval_rewards/accuracies": 0.8100000023841858, + "eval_rewards/chosen": -1.5960688591003418, + "eval_rewards/margins": 5.56620979309082, + "eval_rewards/rejected": -7.16227912902832, + "eval_runtime": 278.2686, + "eval_samples_per_second": 7.187, + "eval_steps_per_second": 0.449, + "step": 4400 + }, + { + "epoch": 2.28, + "learning_rate": 1.3396442914515203e-07, + "logits/chosen": -2.7538981437683105, + "logits/rejected": -2.797692060470581, + "logps/chosen": -267.57086181640625, + "logps/rejected": -371.95587158203125, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.38313159346580505, + "rewards/margins": 9.685579299926758, + "rewards/rejected": -10.068711280822754, + "step": 4410 + }, + { + "epoch": 2.28, + "learning_rate": 1.3300822336966917e-07, + "logits/chosen": -2.8126959800720215, + "logits/rejected": -2.7908413410186768, + "logps/chosen": -300.4286193847656, + "logps/rejected": -410.2023010253906, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0054407119750977, + "rewards/margins": 11.864087104797363, + "rewards/rejected": -10.85864543914795, + "step": 4420 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205201759418626e-07, + "logits/chosen": -2.6961915493011475, + "logits/rejected": -2.730517625808716, + "logps/chosen": -290.7415771484375, + "logps/rejected": -405.54718017578125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3425076007843018, + "rewards/margins": 10.741376876831055, + "rewards/rejected": -9.398869514465332, + "step": 4430 + }, + { + "epoch": 2.29, + "learning_rate": 1.3109581181870338e-07, + "logits/chosen": -2.8597958087921143, + "logits/rejected": -2.8856372833251953, + "logps/chosen": -278.34490966796875, + "logps/rejected": -371.2419738769531, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2198222875595093, + "rewards/margins": 10.515420913696289, + "rewards/rejected": -9.295598983764648, + "step": 4440 + }, + { + "epoch": 2.3, + "learning_rate": 1.301396060432205e-07, + "logits/chosen": -2.7863852977752686, + "logits/rejected": -2.8537216186523438, + "logps/chosen": -253.5373992919922, + "logps/rejected": -363.5780334472656, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5466651916503906, + "rewards/margins": 11.057966232299805, + "rewards/rejected": -10.51130199432373, + "step": 4450 + }, + { + "epoch": 2.3, + "learning_rate": 1.291834002677376e-07, + "logits/chosen": -2.7333171367645264, + "logits/rejected": -2.7991576194763184, + "logps/chosen": -222.57424926757812, + "logps/rejected": -377.4979248046875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21776556968688965, + "rewards/margins": 10.281082153320312, + "rewards/rejected": -10.063316345214844, + "step": 4460 + }, + { + "epoch": 2.31, + "learning_rate": 1.2822719449225472e-07, + "logits/chosen": -2.763277530670166, + "logits/rejected": -2.8399055004119873, + "logps/chosen": -170.37014770507812, + "logps/rejected": -340.293212890625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.284454345703125, + "rewards/margins": 9.632649421691895, + "rewards/rejected": -9.91710376739502, + "step": 4470 + }, + { + "epoch": 2.31, + "learning_rate": 1.2727098871677184e-07, + "logits/chosen": -2.8263633251190186, + "logits/rejected": -2.8704276084899902, + "logps/chosen": -315.4873962402344, + "logps/rejected": -437.9556579589844, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6770858764648438, + "rewards/margins": 11.242297172546387, + "rewards/rejected": -10.565211296081543, + "step": 4480 + }, + { + "epoch": 2.32, + "learning_rate": 1.2631478294128898e-07, + "logits/chosen": -2.8527493476867676, + "logits/rejected": -2.885178565979004, + "logps/chosen": -229.99075317382812, + "logps/rejected": -385.3793029785156, + "loss": 0.0463, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.27444297075271606, + "rewards/margins": 10.513134956359863, + "rewards/rejected": -10.238691329956055, + "step": 4490 + }, + { + "epoch": 2.32, + "learning_rate": 1.253585771658061e-07, + "logits/chosen": -2.831853151321411, + "logits/rejected": -2.8576908111572266, + "logps/chosen": -238.5697021484375, + "logps/rejected": -388.0924377441406, + "loss": 0.1044, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5936632752418518, + "rewards/margins": 10.974578857421875, + "rewards/rejected": -10.380915641784668, + "step": 4500 + }, + { + "epoch": 2.32, + "eval_logits/chosen": -2.783907890319824, + "eval_logits/rejected": -2.848212957382202, + "eval_logps/chosen": -266.22772216796875, + "eval_logps/rejected": -353.6324768066406, + "eval_loss": 0.5818387269973755, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.8547568321228027, + "eval_rewards/margins": 5.894747734069824, + "eval_rewards/rejected": -7.749504089355469, + "eval_runtime": 278.5575, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 4500 + }, + { + "epoch": 2.33, + "learning_rate": 1.2440237139032319e-07, + "logits/chosen": -2.8777260780334473, + "logits/rejected": -2.902764320373535, + "logps/chosen": -277.8800964355469, + "logps/rejected": -350.5680236816406, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32009559869766235, + "rewards/margins": 10.575444221496582, + "rewards/rejected": -10.255348205566406, + "step": 4510 + }, + { + "epoch": 2.33, + "learning_rate": 1.234461656148403e-07, + "logits/chosen": -2.8503713607788086, + "logits/rejected": -2.90812087059021, + "logps/chosen": -303.7490539550781, + "logps/rejected": -406.6163635253906, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04932591691613197, + "rewards/margins": 11.66883659362793, + "rewards/rejected": -11.718160629272461, + "step": 4520 + }, + { + "epoch": 2.34, + "learning_rate": 1.2248995983935742e-07, + "logits/chosen": -2.844027042388916, + "logits/rejected": -2.8921597003936768, + "logps/chosen": -255.0517120361328, + "logps/rejected": -374.0168151855469, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9114421606063843, + "rewards/margins": 11.845464706420898, + "rewards/rejected": -9.934022903442383, + "step": 4530 + }, + { + "epoch": 2.34, + "learning_rate": 1.2153375406387456e-07, + "logits/chosen": -2.7948267459869385, + "logits/rejected": -2.8465352058410645, + "logps/chosen": -280.3050231933594, + "logps/rejected": -407.7957458496094, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20190688967704773, + "rewards/margins": 10.695255279541016, + "rewards/rejected": -10.493348121643066, + "step": 4540 + }, + { + "epoch": 2.35, + "learning_rate": 1.2057754828839165e-07, + "logits/chosen": -2.823227643966675, + "logits/rejected": -2.910917282104492, + "logps/chosen": -234.4201202392578, + "logps/rejected": -369.2624206542969, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06653478741645813, + "rewards/margins": 10.437435150146484, + "rewards/rejected": -10.37090015411377, + "step": 4550 + }, + { + "epoch": 2.35, + "learning_rate": 1.1962134251290876e-07, + "logits/chosen": -2.8018596172332764, + "logits/rejected": -2.8438680171966553, + "logps/chosen": -262.4857482910156, + "logps/rejected": -337.6358337402344, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25648975372314453, + "rewards/margins": 10.115443229675293, + "rewards/rejected": -10.371932983398438, + "step": 4560 + }, + { + "epoch": 2.36, + "learning_rate": 1.1866513673742588e-07, + "logits/chosen": -2.783787488937378, + "logits/rejected": -2.8495638370513916, + "logps/chosen": -283.16571044921875, + "logps/rejected": -342.12945556640625, + "loss": 0.0208, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.38057252764701843, + "rewards/margins": 10.06275749206543, + "rewards/rejected": -9.682184219360352, + "step": 4570 + }, + { + "epoch": 2.36, + "learning_rate": 1.1770893096194301e-07, + "logits/chosen": -2.7712864875793457, + "logits/rejected": -2.8020853996276855, + "logps/chosen": -262.7274475097656, + "logps/rejected": -374.31524658203125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.127746343612671, + "rewards/margins": 11.753847122192383, + "rewards/rejected": -10.62610149383545, + "step": 4580 + }, + { + "epoch": 2.37, + "learning_rate": 1.1675272518646012e-07, + "logits/chosen": -2.834930896759033, + "logits/rejected": -2.8962676525115967, + "logps/chosen": -204.0745086669922, + "logps/rejected": -364.3519592285156, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20914089679718018, + "rewards/margins": 9.767343521118164, + "rewards/rejected": -9.558202743530273, + "step": 4590 + }, + { + "epoch": 2.37, + "learning_rate": 1.1579651941097724e-07, + "logits/chosen": -2.8558883666992188, + "logits/rejected": -2.9146106243133545, + "logps/chosen": -232.75039672851562, + "logps/rejected": -366.9052429199219, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9267519116401672, + "rewards/margins": 10.588906288146973, + "rewards/rejected": -9.662155151367188, + "step": 4600 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.7866322994232178, + "eval_logits/rejected": -2.8511645793914795, + "eval_logps/chosen": -264.5919189453125, + "eval_logps/rejected": -351.72418212890625, + "eval_loss": 0.5813160538673401, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -1.6911762952804565, + "eval_rewards/margins": 5.867499828338623, + "eval_rewards/rejected": -7.558675765991211, + "eval_runtime": 278.7682, + "eval_samples_per_second": 7.174, + "eval_steps_per_second": 0.448, + "step": 4600 + }, + { + "epoch": 2.38, + "learning_rate": 1.1484031363549436e-07, + "logits/chosen": -2.7684288024902344, + "logits/rejected": -2.804677963256836, + "logps/chosen": -320.72589111328125, + "logps/rejected": -374.26654052734375, + "loss": 0.0165, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.35492923855781555, + "rewards/margins": 10.75482177734375, + "rewards/rejected": -10.399892807006836, + "step": 4610 + }, + { + "epoch": 2.39, + "learning_rate": 1.1388410786001147e-07, + "logits/chosen": -2.8026232719421387, + "logits/rejected": -2.85943341255188, + "logps/chosen": -211.3720703125, + "logps/rejected": -388.8536071777344, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15674147009849548, + "rewards/margins": 10.096702575683594, + "rewards/rejected": -10.253443717956543, + "step": 4620 + }, + { + "epoch": 2.39, + "learning_rate": 1.1292790208452859e-07, + "logits/chosen": -2.840369939804077, + "logits/rejected": -2.833326816558838, + "logps/chosen": -233.71926879882812, + "logps/rejected": -393.7159118652344, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027848612517118454, + "rewards/margins": 11.55732250213623, + "rewards/rejected": -11.585172653198242, + "step": 4630 + }, + { + "epoch": 2.4, + "learning_rate": 1.119716963090457e-07, + "logits/chosen": -2.794414758682251, + "logits/rejected": -2.8389816284179688, + "logps/chosen": -240.9445343017578, + "logps/rejected": -396.8078308105469, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32970330119132996, + "rewards/margins": 11.215761184692383, + "rewards/rejected": -11.545463562011719, + "step": 4640 + }, + { + "epoch": 2.4, + "learning_rate": 1.1101549053356282e-07, + "logits/chosen": -2.7997448444366455, + "logits/rejected": -2.8671469688415527, + "logps/chosen": -243.86978149414062, + "logps/rejected": -387.15045166015625, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.43554702401161194, + "rewards/margins": 11.55484390258789, + "rewards/rejected": -11.11929702758789, + "step": 4650 + }, + { + "epoch": 2.41, + "learning_rate": 1.1005928475807993e-07, + "logits/chosen": -2.7641167640686035, + "logits/rejected": -2.822042942047119, + "logps/chosen": -204.0389862060547, + "logps/rejected": -362.5557556152344, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1727055460214615, + "rewards/margins": 11.18034553527832, + "rewards/rejected": -11.007640838623047, + "step": 4660 + }, + { + "epoch": 2.41, + "learning_rate": 1.0910307898259705e-07, + "logits/chosen": -2.724806308746338, + "logits/rejected": -2.793595314025879, + "logps/chosen": -233.08151245117188, + "logps/rejected": -413.00274658203125, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1520412564277649, + "rewards/margins": 10.408720016479492, + "rewards/rejected": -10.560762405395508, + "step": 4670 + }, + { + "epoch": 2.42, + "learning_rate": 1.0814687320711418e-07, + "logits/chosen": -2.6983354091644287, + "logits/rejected": -2.76259446144104, + "logps/chosen": -199.3309326171875, + "logps/rejected": -420.94757080078125, + "loss": 0.0096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.28074705600738525, + "rewards/margins": 11.806534767150879, + "rewards/rejected": -11.525787353515625, + "step": 4680 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719066743163128e-07, + "logits/chosen": -2.7496988773345947, + "logits/rejected": -2.8320717811584473, + "logps/chosen": -302.5816345214844, + "logps/rejected": -407.7145690917969, + "loss": 0.0166, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.16056537628173828, + "rewards/margins": 11.588438034057617, + "rewards/rejected": -11.749003410339355, + "step": 4690 + }, + { + "epoch": 2.43, + "learning_rate": 1.062344616561484e-07, + "logits/chosen": -2.8104662895202637, + "logits/rejected": -2.8494856357574463, + "logps/chosen": -302.8609313964844, + "logps/rejected": -400.6754150390625, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10702119022607803, + "rewards/margins": 12.200922966003418, + "rewards/rejected": -12.093901634216309, + "step": 4700 + }, + { + "epoch": 2.43, + "eval_logits/chosen": -2.75579571723938, + "eval_logits/rejected": -2.8210208415985107, + "eval_logps/chosen": -270.0638732910156, + "eval_logps/rejected": -359.8251647949219, + "eval_loss": 0.605195164680481, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": -2.238370180130005, + "eval_rewards/margins": 6.130407810211182, + "eval_rewards/rejected": -8.36877727508545, + "eval_runtime": 278.3561, + "eval_samples_per_second": 7.185, + "eval_steps_per_second": 0.449, + "step": 4700 + }, + { + "epoch": 2.43, + "learning_rate": 1.0527825588066551e-07, + "logits/chosen": -2.7616004943847656, + "logits/rejected": -2.8404548168182373, + "logps/chosen": -256.90576171875, + "logps/rejected": -385.58367919921875, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.34221506118774414, + "rewards/margins": 12.297042846679688, + "rewards/rejected": -11.954826354980469, + "step": 4710 + }, + { + "epoch": 2.44, + "learning_rate": 1.0432205010518264e-07, + "logits/chosen": -2.779740333557129, + "logits/rejected": -2.840048313140869, + "logps/chosen": -210.28982543945312, + "logps/rejected": -393.116943359375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.500928521156311, + "rewards/margins": 11.802443504333496, + "rewards/rejected": -11.301515579223633, + "step": 4720 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336584432969974e-07, + "logits/chosen": -2.7029783725738525, + "logits/rejected": -2.738452434539795, + "logps/chosen": -209.94808959960938, + "logps/rejected": -359.03863525390625, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2978671193122864, + "rewards/margins": 9.817428588867188, + "rewards/rejected": -9.519559860229492, + "step": 4730 + }, + { + "epoch": 2.45, + "learning_rate": 1.0240963855421686e-07, + "logits/chosen": -2.6916146278381348, + "logits/rejected": -2.7506518363952637, + "logps/chosen": -329.70587158203125, + "logps/rejected": -386.9439697265625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8479889035224915, + "rewards/margins": 10.828804016113281, + "rewards/rejected": -9.980814933776855, + "step": 4740 + }, + { + "epoch": 2.45, + "learning_rate": 1.0145343277873399e-07, + "logits/chosen": -2.7057933807373047, + "logits/rejected": -2.7978127002716064, + "logps/chosen": -277.19329833984375, + "logps/rejected": -393.6683044433594, + "loss": 0.0159, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7689875364303589, + "rewards/margins": 11.254928588867188, + "rewards/rejected": -10.485939979553223, + "step": 4750 + }, + { + "epoch": 2.46, + "learning_rate": 1.004972270032511e-07, + "logits/chosen": -2.6612842082977295, + "logits/rejected": -2.6679983139038086, + "logps/chosen": -275.8064270019531, + "logps/rejected": -327.3836364746094, + "loss": 0.0302, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3787831664085388, + "rewards/margins": 10.829161643981934, + "rewards/rejected": -11.207944869995117, + "step": 4760 + }, + { + "epoch": 2.46, + "learning_rate": 9.95410212277682e-08, + "logits/chosen": -2.767195463180542, + "logits/rejected": -2.8072774410247803, + "logps/chosen": -275.5713806152344, + "logps/rejected": -398.08782958984375, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25600147247314453, + "rewards/margins": 10.520998001098633, + "rewards/rejected": -10.77700138092041, + "step": 4770 + }, + { + "epoch": 2.47, + "learning_rate": 9.858481545228532e-08, + "logits/chosen": -2.7905125617980957, + "logits/rejected": -2.7901828289031982, + "logps/chosen": -246.63400268554688, + "logps/rejected": -346.6756896972656, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.0959916040301323, + "rewards/margins": 10.186857223510742, + "rewards/rejected": -10.28284740447998, + "step": 4780 + }, + { + "epoch": 2.47, + "learning_rate": 9.762860967680245e-08, + "logits/chosen": -2.765639543533325, + "logits/rejected": -2.750558614730835, + "logps/chosen": -260.526123046875, + "logps/rejected": -423.7069396972656, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6589454412460327, + "rewards/margins": 11.203248977661133, + "rewards/rejected": -11.86219596862793, + "step": 4790 + }, + { + "epoch": 2.48, + "learning_rate": 9.667240390131957e-08, + "logits/chosen": -2.8430063724517822, + "logits/rejected": -2.862946033477783, + "logps/chosen": -288.14813232421875, + "logps/rejected": -384.7672119140625, + "loss": 0.0636, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6992109417915344, + "rewards/margins": 11.617467880249023, + "rewards/rejected": -10.918257713317871, + "step": 4800 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.779690742492676, + "eval_logits/rejected": -2.8455140590667725, + "eval_logps/chosen": -266.16302490234375, + "eval_logps/rejected": -353.9501953125, + "eval_loss": 0.5866954326629639, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.848286747932434, + "eval_rewards/margins": 5.932989120483398, + "eval_rewards/rejected": -7.781275272369385, + "eval_runtime": 278.9143, + "eval_samples_per_second": 7.171, + "eval_steps_per_second": 0.448, + "step": 4800 + }, + { + "epoch": 2.48, + "learning_rate": 9.571619812583667e-08, + "logits/chosen": -2.7728781700134277, + "logits/rejected": -2.819183349609375, + "logps/chosen": -268.3041076660156, + "logps/rejected": -418.9088439941406, + "loss": 0.0139, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.47220858931541443, + "rewards/margins": 12.578226089477539, + "rewards/rejected": -12.10601806640625, + "step": 4810 + }, + { + "epoch": 2.49, + "learning_rate": 9.47599923503538e-08, + "logits/chosen": -2.7985777854919434, + "logits/rejected": -2.8246827125549316, + "logps/chosen": -248.9866485595703, + "logps/rejected": -388.82525634765625, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.21932640671730042, + "rewards/margins": 10.778423309326172, + "rewards/rejected": -10.55909538269043, + "step": 4820 + }, + { + "epoch": 2.49, + "learning_rate": 9.380378657487091e-08, + "logits/chosen": -2.8599352836608887, + "logits/rejected": -2.8889236450195312, + "logps/chosen": -281.4698486328125, + "logps/rejected": -385.79595947265625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5898095965385437, + "rewards/margins": 12.167850494384766, + "rewards/rejected": -11.578041076660156, + "step": 4830 + }, + { + "epoch": 2.5, + "learning_rate": 9.284758079938803e-08, + "logits/chosen": -2.807657241821289, + "logits/rejected": -2.8112683296203613, + "logps/chosen": -304.69256591796875, + "logps/rejected": -374.6174621582031, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.626765787601471, + "rewards/margins": 11.146921157836914, + "rewards/rejected": -10.520155906677246, + "step": 4840 + }, + { + "epoch": 2.5, + "learning_rate": 9.189137502390513e-08, + "logits/chosen": -2.855536699295044, + "logits/rejected": -2.8657050132751465, + "logps/chosen": -270.238037109375, + "logps/rejected": -440.15814208984375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2958551049232483, + "rewards/margins": 10.37101936340332, + "rewards/rejected": -10.66687297821045, + "step": 4850 + }, + { + "epoch": 2.51, + "learning_rate": 9.093516924842226e-08, + "logits/chosen": -2.802277088165283, + "logits/rejected": -2.852307081222534, + "logps/chosen": -238.21304321289062, + "logps/rejected": -415.5626525878906, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5378857254981995, + "rewards/margins": 11.27935791015625, + "rewards/rejected": -10.741472244262695, + "step": 4860 + }, + { + "epoch": 2.51, + "learning_rate": 8.997896347293938e-08, + "logits/chosen": -2.8001160621643066, + "logits/rejected": -2.879955768585205, + "logps/chosen": -195.65481567382812, + "logps/rejected": -427.76470947265625, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1122699975967407, + "rewards/margins": 11.812494277954102, + "rewards/rejected": -10.700222969055176, + "step": 4870 + }, + { + "epoch": 2.52, + "learning_rate": 8.902275769745648e-08, + "logits/chosen": -2.8066625595092773, + "logits/rejected": -2.8681557178497314, + "logps/chosen": -243.08468627929688, + "logps/rejected": -397.8512268066406, + "loss": 0.0119, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1370861530303955, + "rewards/margins": 11.573083877563477, + "rewards/rejected": -11.710169792175293, + "step": 4880 + }, + { + "epoch": 2.52, + "learning_rate": 8.806655192197361e-08, + "logits/chosen": -2.777631998062134, + "logits/rejected": -2.854902744293213, + "logps/chosen": -234.71383666992188, + "logps/rejected": -398.0288391113281, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.32972007989883423, + "rewards/margins": 11.04564094543457, + "rewards/rejected": -11.375360488891602, + "step": 4890 + }, + { + "epoch": 2.53, + "learning_rate": 8.711034614649072e-08, + "logits/chosen": -2.842499256134033, + "logits/rejected": -2.846536874771118, + "logps/chosen": -236.6642303466797, + "logps/rejected": -412.7945861816406, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5801752805709839, + "rewards/margins": 12.802286148071289, + "rewards/rejected": -12.2221097946167, + "step": 4900 + }, + { + "epoch": 2.53, + "eval_logits/chosen": -2.768664598464966, + "eval_logits/rejected": -2.8342463970184326, + "eval_logps/chosen": -266.76190185546875, + "eval_logps/rejected": -354.1345520019531, + "eval_loss": 0.5878357291221619, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.9081742763519287, + "eval_rewards/margins": 5.891535758972168, + "eval_rewards/rejected": -7.799710273742676, + "eval_runtime": 278.4252, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.449, + "step": 4900 + }, + { + "epoch": 2.53, + "learning_rate": 8.615414037100784e-08, + "logits/chosen": -2.744393825531006, + "logits/rejected": -2.792158603668213, + "logps/chosen": -311.90576171875, + "logps/rejected": -381.0210266113281, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9963443875312805, + "rewards/margins": 11.411710739135742, + "rewards/rejected": -10.415367126464844, + "step": 4910 + }, + { + "epoch": 2.54, + "learning_rate": 8.519793459552494e-08, + "logits/chosen": -2.7664597034454346, + "logits/rejected": -2.8189749717712402, + "logps/chosen": -216.3651123046875, + "logps/rejected": -382.20184326171875, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7522561550140381, + "rewards/margins": 11.970437049865723, + "rewards/rejected": -11.218182563781738, + "step": 4920 + }, + { + "epoch": 2.55, + "learning_rate": 8.424172882004207e-08, + "logits/chosen": -2.766000270843506, + "logits/rejected": -2.815253257751465, + "logps/chosen": -247.57177734375, + "logps/rejected": -414.02117919921875, + "loss": 0.0223, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.35223907232284546, + "rewards/margins": 9.96684455871582, + "rewards/rejected": -9.614606857299805, + "step": 4930 + }, + { + "epoch": 2.55, + "learning_rate": 8.328552304455919e-08, + "logits/chosen": -2.8126516342163086, + "logits/rejected": -2.8359436988830566, + "logps/chosen": -246.8765106201172, + "logps/rejected": -343.9100036621094, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.468789666891098, + "rewards/margins": 10.000219345092773, + "rewards/rejected": -9.5314302444458, + "step": 4940 + }, + { + "epoch": 2.56, + "learning_rate": 8.23293172690763e-08, + "logits/chosen": -2.7989373207092285, + "logits/rejected": -2.830984115600586, + "logps/chosen": -271.40362548828125, + "logps/rejected": -391.43890380859375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7524517178535461, + "rewards/margins": 11.619732856750488, + "rewards/rejected": -10.867280960083008, + "step": 4950 + }, + { + "epoch": 2.56, + "learning_rate": 8.137311149359343e-08, + "logits/chosen": -2.868978977203369, + "logits/rejected": -2.8965516090393066, + "logps/chosen": -297.94769287109375, + "logps/rejected": -374.5810241699219, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8119597434997559, + "rewards/margins": 11.822549819946289, + "rewards/rejected": -11.010589599609375, + "step": 4960 + }, + { + "epoch": 2.57, + "learning_rate": 8.041690571811053e-08, + "logits/chosen": -2.8357903957366943, + "logits/rejected": -2.8867337703704834, + "logps/chosen": -277.58050537109375, + "logps/rejected": -391.5204772949219, + "loss": 0.0156, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.00696375360712409, + "rewards/margins": 11.595417022705078, + "rewards/rejected": -11.602380752563477, + "step": 4970 + }, + { + "epoch": 2.57, + "learning_rate": 7.946069994262765e-08, + "logits/chosen": -2.7701573371887207, + "logits/rejected": -2.7922308444976807, + "logps/chosen": -273.548583984375, + "logps/rejected": -363.3747863769531, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.018690502271056175, + "rewards/margins": 9.98866081237793, + "rewards/rejected": -9.969969749450684, + "step": 4980 + }, + { + "epoch": 2.58, + "learning_rate": 7.850449416714476e-08, + "logits/chosen": -2.7947163581848145, + "logits/rejected": -2.8349592685699463, + "logps/chosen": -271.5372009277344, + "logps/rejected": -382.420654296875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37151023745536804, + "rewards/margins": 10.325210571289062, + "rewards/rejected": -9.953700065612793, + "step": 4990 + }, + { + "epoch": 2.58, + "learning_rate": 7.754828839166188e-08, + "logits/chosen": -2.783332586288452, + "logits/rejected": -2.8476181030273438, + "logps/chosen": -250.2198486328125, + "logps/rejected": -405.2004699707031, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36954087018966675, + "rewards/margins": 11.464941024780273, + "rewards/rejected": -11.834482192993164, + "step": 5000 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.7497642040252686, + "eval_logits/rejected": -2.814385175704956, + "eval_logps/chosen": -269.3044738769531, + "eval_logps/rejected": -358.25360107421875, + "eval_loss": 0.5969280004501343, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -2.162431240081787, + "eval_rewards/margins": 6.049188137054443, + "eval_rewards/rejected": -8.211620330810547, + "eval_runtime": 278.5346, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 5000 + }, + { + "epoch": 2.59, + "learning_rate": 7.6592082616179e-08, + "logits/chosen": -2.7811551094055176, + "logits/rejected": -2.8356614112854004, + "logps/chosen": -245.42977905273438, + "logps/rejected": -425.71038818359375, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.30897045135498047, + "rewards/margins": 11.47396183013916, + "rewards/rejected": -11.78293228149414, + "step": 5010 + }, + { + "epoch": 2.59, + "learning_rate": 7.563587684069611e-08, + "logits/chosen": -2.7130038738250732, + "logits/rejected": -2.8152432441711426, + "logps/chosen": -284.6722717285156, + "logps/rejected": -402.2973937988281, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4836142063140869, + "rewards/margins": 11.877161979675293, + "rewards/rejected": -11.393548965454102, + "step": 5020 + }, + { + "epoch": 2.6, + "learning_rate": 7.467967106521324e-08, + "logits/chosen": -2.779438018798828, + "logits/rejected": -2.834465265274048, + "logps/chosen": -247.9782257080078, + "logps/rejected": -290.38848876953125, + "loss": 0.0119, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.46960878372192383, + "rewards/margins": 9.75239372253418, + "rewards/rejected": -10.222002983093262, + "step": 5030 + }, + { + "epoch": 2.6, + "learning_rate": 7.372346528973034e-08, + "logits/chosen": -2.828298807144165, + "logits/rejected": -2.8650214672088623, + "logps/chosen": -255.68765258789062, + "logps/rejected": -359.00189208984375, + "loss": 0.0169, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4944925308227539, + "rewards/margins": 11.243809700012207, + "rewards/rejected": -10.749317169189453, + "step": 5040 + }, + { + "epoch": 2.61, + "learning_rate": 7.276725951424746e-08, + "logits/chosen": -2.7610697746276855, + "logits/rejected": -2.795448064804077, + "logps/chosen": -215.3785400390625, + "logps/rejected": -383.4140930175781, + "loss": 0.0185, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4611515998840332, + "rewards/margins": 10.838452339172363, + "rewards/rejected": -11.299602508544922, + "step": 5050 + }, + { + "epoch": 2.61, + "learning_rate": 7.181105373876457e-08, + "logits/chosen": -2.779067277908325, + "logits/rejected": -2.810563802719116, + "logps/chosen": -245.2666778564453, + "logps/rejected": -347.674072265625, + "loss": 0.0127, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.05352171137928963, + "rewards/margins": 10.571831703186035, + "rewards/rejected": -10.518308639526367, + "step": 5060 + }, + { + "epoch": 2.62, + "learning_rate": 7.08548479632817e-08, + "logits/chosen": -2.7645630836486816, + "logits/rejected": -2.807478427886963, + "logps/chosen": -301.7003173828125, + "logps/rejected": -394.48809814453125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8143863677978516, + "rewards/margins": 12.549185752868652, + "rewards/rejected": -11.734800338745117, + "step": 5070 + }, + { + "epoch": 2.62, + "learning_rate": 6.98986421877988e-08, + "logits/chosen": -2.80440616607666, + "logits/rejected": -2.832104206085205, + "logps/chosen": -273.5317077636719, + "logps/rejected": -410.95965576171875, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.07825110107660294, + "rewards/margins": 10.619649887084961, + "rewards/rejected": -10.541399955749512, + "step": 5080 + }, + { + "epoch": 2.63, + "learning_rate": 6.894243641231592e-08, + "logits/chosen": -2.7227680683135986, + "logits/rejected": -2.7704923152923584, + "logps/chosen": -265.0140075683594, + "logps/rejected": -402.3088073730469, + "loss": 0.0104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.47135013341903687, + "rewards/margins": 12.040913581848145, + "rewards/rejected": -11.569562911987305, + "step": 5090 + }, + { + "epoch": 2.63, + "learning_rate": 6.798623063683305e-08, + "logits/chosen": -2.719788074493408, + "logits/rejected": -2.770781993865967, + "logps/chosen": -280.3219299316406, + "logps/rejected": -383.2566833496094, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07484505325555801, + "rewards/margins": 12.587934494018555, + "rewards/rejected": -12.6627779006958, + "step": 5100 + }, + { + "epoch": 2.63, + "eval_logits/chosen": -2.755725145339966, + "eval_logits/rejected": -2.8196589946746826, + "eval_logps/chosen": -269.3545837402344, + "eval_logps/rejected": -358.35565185546875, + "eval_loss": 0.6007997989654541, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -2.16744327545166, + "eval_rewards/margins": 6.054382801055908, + "eval_rewards/rejected": -8.221826553344727, + "eval_runtime": 278.6404, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.449, + "step": 5100 + }, + { + "epoch": 2.64, + "learning_rate": 6.703002486135017e-08, + "logits/chosen": -2.7945752143859863, + "logits/rejected": -2.8086471557617188, + "logps/chosen": -247.31655883789062, + "logps/rejected": -398.75177001953125, + "loss": 0.0186, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4659408628940582, + "rewards/margins": 11.80299186706543, + "rewards/rejected": -12.268933296203613, + "step": 5110 + }, + { + "epoch": 2.64, + "learning_rate": 6.607381908586727e-08, + "logits/chosen": -2.809502601623535, + "logits/rejected": -2.7968926429748535, + "logps/chosen": -258.90118408203125, + "logps/rejected": -439.64849853515625, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.32113468647003174, + "rewards/margins": 11.847344398498535, + "rewards/rejected": -11.526209831237793, + "step": 5120 + }, + { + "epoch": 2.65, + "learning_rate": 6.511761331038438e-08, + "logits/chosen": -2.8286375999450684, + "logits/rejected": -2.859506130218506, + "logps/chosen": -230.7789764404297, + "logps/rejected": -376.81695556640625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6774338483810425, + "rewards/margins": 10.802311897277832, + "rewards/rejected": -11.479743957519531, + "step": 5130 + }, + { + "epoch": 2.65, + "learning_rate": 6.416140753490151e-08, + "logits/chosen": -2.7911789417266846, + "logits/rejected": -2.8675835132598877, + "logps/chosen": -271.16534423828125, + "logps/rejected": -420.4945373535156, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7790706753730774, + "rewards/margins": 11.275418281555176, + "rewards/rejected": -10.49634838104248, + "step": 5140 + }, + { + "epoch": 2.66, + "learning_rate": 6.320520175941863e-08, + "logits/chosen": -2.7352991104125977, + "logits/rejected": -2.7836477756500244, + "logps/chosen": -245.20999145507812, + "logps/rejected": -382.8066101074219, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1987796127796173, + "rewards/margins": 11.559615135192871, + "rewards/rejected": -11.758394241333008, + "step": 5150 + }, + { + "epoch": 2.66, + "learning_rate": 6.224899598393573e-08, + "logits/chosen": -2.826261281967163, + "logits/rejected": -2.8705601692199707, + "logps/chosen": -322.09613037109375, + "logps/rejected": -440.3390197753906, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9744750261306763, + "rewards/margins": 12.657623291015625, + "rewards/rejected": -11.683148384094238, + "step": 5160 + }, + { + "epoch": 2.67, + "learning_rate": 6.129279020845286e-08, + "logits/chosen": -2.837642192840576, + "logits/rejected": -2.821343421936035, + "logps/chosen": -219.88204956054688, + "logps/rejected": -346.25341796875, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9648548364639282, + "rewards/margins": 12.348124504089355, + "rewards/rejected": -11.383270263671875, + "step": 5170 + }, + { + "epoch": 2.67, + "learning_rate": 6.033658443296998e-08, + "logits/chosen": -2.808182716369629, + "logits/rejected": -2.8574142456054688, + "logps/chosen": -260.39801025390625, + "logps/rejected": -390.830322265625, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6420486569404602, + "rewards/margins": 11.35422420501709, + "rewards/rejected": -10.712176322937012, + "step": 5180 + }, + { + "epoch": 2.68, + "learning_rate": 5.9380378657487085e-08, + "logits/chosen": -2.784289836883545, + "logits/rejected": -2.85602068901062, + "logps/chosen": -277.7592468261719, + "logps/rejected": -387.52587890625, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.17926214635372162, + "rewards/margins": 11.136748313903809, + "rewards/rejected": -10.957486152648926, + "step": 5190 + }, + { + "epoch": 2.68, + "learning_rate": 5.842417288200421e-08, + "logits/chosen": -2.789968729019165, + "logits/rejected": -2.866093158721924, + "logps/chosen": -285.8483581542969, + "logps/rejected": -409.1543273925781, + "loss": 0.0103, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.21996426582336426, + "rewards/margins": 11.31649112701416, + "rewards/rejected": -11.096527099609375, + "step": 5200 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.7545626163482666, + "eval_logits/rejected": -2.8181042671203613, + "eval_logps/chosen": -271.59014892578125, + "eval_logps/rejected": -362.2855529785156, + "eval_loss": 0.621418833732605, + "eval_rewards/accuracies": 0.8059999942779541, + "eval_rewards/chosen": -2.391003370285034, + "eval_rewards/margins": 6.22381067276001, + "eval_rewards/rejected": -8.614813804626465, + "eval_runtime": 278.4224, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.449, + "step": 5200 + }, + { + "epoch": 2.69, + "learning_rate": 5.7467967106521317e-08, + "logits/chosen": -2.7700493335723877, + "logits/rejected": -2.8540453910827637, + "logps/chosen": -192.63064575195312, + "logps/rejected": -372.6647644042969, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06132305786013603, + "rewards/margins": 11.770467758178711, + "rewards/rejected": -11.831789016723633, + "step": 5210 + }, + { + "epoch": 2.69, + "learning_rate": 5.651176133103844e-08, + "logits/chosen": -2.817718029022217, + "logits/rejected": -2.8606326580047607, + "logps/chosen": -231.86410522460938, + "logps/rejected": -427.46014404296875, + "loss": 0.0143, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.520971417427063, + "rewards/margins": 13.073326110839844, + "rewards/rejected": -12.55235481262207, + "step": 5220 + }, + { + "epoch": 2.7, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -2.744419574737549, + "logits/rejected": -2.8055806159973145, + "logps/chosen": -279.8837890625, + "logps/rejected": -321.3486022949219, + "loss": 0.0248, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.1986587941646576, + "rewards/margins": 10.663652420043945, + "rewards/rejected": -10.862310409545898, + "step": 5230 + }, + { + "epoch": 2.71, + "learning_rate": 5.459934978007267e-08, + "logits/chosen": -2.812368631362915, + "logits/rejected": -2.8478336334228516, + "logps/chosen": -265.64459228515625, + "logps/rejected": -392.9212951660156, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24865750968456268, + "rewards/margins": 12.445100784301758, + "rewards/rejected": -12.196441650390625, + "step": 5240 + }, + { + "epoch": 2.71, + "learning_rate": 5.3643144004589786e-08, + "logits/chosen": -2.753692150115967, + "logits/rejected": -2.7644002437591553, + "logps/chosen": -251.43661499023438, + "logps/rejected": -412.4212341308594, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4879121780395508, + "rewards/margins": 13.612544059753418, + "rewards/rejected": -12.124631881713867, + "step": 5250 + }, + { + "epoch": 2.72, + "learning_rate": 5.26869382291069e-08, + "logits/chosen": -2.7969448566436768, + "logits/rejected": -2.845008373260498, + "logps/chosen": -232.2137908935547, + "logps/rejected": -344.79449462890625, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3992539942264557, + "rewards/margins": 10.323373794555664, + "rewards/rejected": -10.722628593444824, + "step": 5260 + }, + { + "epoch": 2.72, + "learning_rate": 5.173073245362402e-08, + "logits/chosen": -2.737816572189331, + "logits/rejected": -2.803926706314087, + "logps/chosen": -263.85369873046875, + "logps/rejected": -402.1507263183594, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2205667942762375, + "rewards/margins": 11.85888671875, + "rewards/rejected": -12.079452514648438, + "step": 5270 }, { "epoch": 2.73, - "learning_rate": 5.05166475315729e-08, - "logits/chosen": -2.7941179275512695, - "logits/rejected": -2.777252674102783, - "logps/chosen": -359.3582763671875, - "logps/rejected": -361.58026123046875, - "loss": 0.0118, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.0606005191802979, - "rewards/margins": 8.198938369750977, - "rewards/rejected": -9.259538650512695, - "step": 2640 + "learning_rate": 5.077452667814113e-08, + "logits/chosen": -2.7733945846557617, + "logits/rejected": -2.7655272483825684, + "logps/chosen": -239.1111297607422, + "logps/rejected": -379.08404541015625, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.969031810760498, + "rewards/margins": 11.643916130065918, + "rewards/rejected": -12.612947463989258, + "step": 5280 + }, + { + "epoch": 2.73, + "learning_rate": 4.981832090265825e-08, + "logits/chosen": -2.727570056915283, + "logits/rejected": -2.7912638187408447, + "logps/chosen": -214.1581573486328, + "logps/rejected": -376.3984375, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.15673276782035828, + "rewards/margins": 11.739975929260254, + "rewards/rejected": -11.896708488464355, + "step": 5290 }, { "epoch": 2.74, - "learning_rate": 4.860313815537696e-08, - "logits/chosen": -2.8406758308410645, - "logits/rejected": -2.7846803665161133, - "logps/chosen": -351.44287109375, - "logps/rejected": -380.54876708984375, - "loss": 0.0231, + "learning_rate": 4.8862115127175364e-08, + "logits/chosen": -2.8189170360565186, + "logits/rejected": -2.8516533374786377, + "logps/chosen": -282.37646484375, + "logps/rejected": -383.95074462890625, + "loss": 0.0035, "rewards/accuracies": 1.0, - "rewards/chosen": -0.5040115714073181, - "rewards/margins": 8.634989738464355, - "rewards/rejected": -9.13900089263916, - "step": 2650 + "rewards/chosen": 0.01846367120742798, + "rewards/margins": 11.932182312011719, + "rewards/rejected": -11.913717269897461, + "step": 5300 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -2.7435574531555176, + "eval_logits/rejected": -2.804840326309204, + "eval_logps/chosen": -270.68603515625, + "eval_logps/rejected": -360.4677429199219, + "eval_loss": 0.6089810132980347, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -2.3005878925323486, + "eval_rewards/margins": 6.132449150085449, + "eval_rewards/rejected": -8.433036804199219, + "eval_runtime": 278.4607, + "eval_samples_per_second": 7.182, + "eval_steps_per_second": 0.449, + "step": 5300 + }, + { + "epoch": 2.74, + "learning_rate": 4.790590935169248e-08, + "logits/chosen": -2.776038646697998, + "logits/rejected": -2.8061070442199707, + "logps/chosen": -217.2928924560547, + "logps/rejected": -490.9940490722656, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.34224528074264526, + "rewards/margins": 11.75390911102295, + "rewards/rejected": -11.411664962768555, + "step": 5310 }, { "epoch": 2.75, - "learning_rate": 4.668962877918101e-08, - "logits/chosen": -2.7726569175720215, - "logits/rejected": -2.7721505165100098, - "logps/chosen": -340.0118103027344, - "logps/rejected": -435.3929748535156, - "loss": 0.0142, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.6642410755157471, - "rewards/margins": 9.08407211303711, - "rewards/rejected": -9.748313903808594, - "step": 2660 + "learning_rate": 4.69497035762096e-08, + "logits/chosen": -2.762821674346924, + "logits/rejected": -2.774388313293457, + "logps/chosen": -233.3072509765625, + "logps/rejected": -377.55517578125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8804025650024414, + "rewards/margins": 12.22156810760498, + "rewards/rejected": -13.101969718933105, + "step": 5320 + }, + { + "epoch": 2.75, + "learning_rate": 4.599349780072671e-08, + "logits/chosen": -2.7343251705169678, + "logits/rejected": -2.776947498321533, + "logps/chosen": -225.74252319335938, + "logps/rejected": -361.77642822265625, + "loss": 0.009, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6561448574066162, + "rewards/margins": 12.739583969116211, + "rewards/rejected": -12.083440780639648, + "step": 5330 }, { "epoch": 2.76, - "learning_rate": 4.477611940298507e-08, - "logits/chosen": -2.775136947631836, - "logits/rejected": -2.7988369464874268, - "logps/chosen": -354.5794677734375, - "logps/rejected": -364.4373474121094, - "loss": 0.0159, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.16665682196617126, - "rewards/margins": 8.789073944091797, - "rewards/rejected": -8.955730438232422, - "step": 2670 + "learning_rate": 4.5037292025243834e-08, + "logits/chosen": -2.7473442554473877, + "logits/rejected": -2.801457166671753, + "logps/chosen": -312.99407958984375, + "logps/rejected": -443.25592041015625, + "loss": 0.0175, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4658606946468353, + "rewards/margins": 10.478414535522461, + "rewards/rejected": -10.944275856018066, + "step": 5340 + }, + { + "epoch": 2.76, + "learning_rate": 4.408108624976094e-08, + "logits/chosen": -2.8104164600372314, + "logits/rejected": -2.8661699295043945, + "logps/chosen": -202.13577270507812, + "logps/rejected": -429.741455078125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3201662302017212, + "rewards/margins": 14.21458911895752, + "rewards/rejected": -12.89442253112793, + "step": 5350 }, { "epoch": 2.77, - "learning_rate": 4.2862610026789124e-08, - "logits/chosen": -2.7736451625823975, - "logits/rejected": -2.815216064453125, - "logps/chosen": -350.3184509277344, - "logps/rejected": -368.71844482421875, - "loss": 0.0162, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.13622841238975525, - "rewards/margins": 9.146402359008789, - "rewards/rejected": -9.010174751281738, - "step": 2680 + "learning_rate": 4.3124880474278065e-08, + "logits/chosen": -2.8079819679260254, + "logits/rejected": -2.870008945465088, + "logps/chosen": -219.47933959960938, + "logps/rejected": -364.24273681640625, + "loss": 0.0179, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.32802659273147583, + "rewards/margins": 11.084989547729492, + "rewards/rejected": -10.756962776184082, + "step": 5360 + }, + { + "epoch": 2.77, + "learning_rate": 4.2168674698795174e-08, + "logits/chosen": -2.738285779953003, + "logits/rejected": -2.7943713665008545, + "logps/chosen": -246.6175537109375, + "logps/rejected": -403.51226806640625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01692439243197441, + "rewards/margins": 12.024803161621094, + "rewards/rejected": -12.007880210876465, + "step": 5370 }, { "epoch": 2.78, - "learning_rate": 4.0949100650593186e-08, - "logits/chosen": -2.8010916709899902, - "logits/rejected": -2.8058903217315674, - "logps/chosen": -297.83599853515625, - "logps/rejected": -378.70892333984375, - "loss": 0.0113, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.6468523144721985, - "rewards/margins": 8.825592041015625, - "rewards/rejected": -9.472444534301758, - "step": 2690 + "learning_rate": 4.1212468923312296e-08, + "logits/chosen": -2.6675705909729004, + "logits/rejected": -2.708002805709839, + "logps/chosen": -237.9582061767578, + "logps/rejected": -425.62200927734375, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05177364498376846, + "rewards/margins": 11.023565292358398, + "rewards/rejected": -11.075338363647461, + "step": 5380 + }, + { + "epoch": 2.78, + "learning_rate": 4.025626314782941e-08, + "logits/chosen": -2.641225814819336, + "logits/rejected": -2.7300140857696533, + "logps/chosen": -268.275146484375, + "logps/rejected": -392.2470397949219, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6281940937042236, + "rewards/margins": 12.283098220825195, + "rewards/rejected": -11.65490436553955, + "step": 5390 }, { "epoch": 2.79, - "learning_rate": 3.903559127439724e-08, - "logits/chosen": -2.7962281703948975, - "logits/rejected": -2.7654640674591064, - "logps/chosen": -352.7866516113281, - "logps/rejected": -370.88226318359375, - "loss": 0.0127, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.05651530623435974, - "rewards/margins": 8.773322105407715, - "rewards/rejected": -8.716808319091797, - "step": 2700 + "learning_rate": 3.930005737234653e-08, + "logits/chosen": -2.786956310272217, + "logits/rejected": -2.852847099304199, + "logps/chosen": -188.55160522460938, + "logps/rejected": -326.9624938964844, + "loss": 0.0145, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.005254592280834913, + "rewards/margins": 10.982970237731934, + "rewards/rejected": -10.977715492248535, + "step": 5400 }, { "epoch": 2.79, - "eval_logits/chosen": -2.7651772499084473, - "eval_logits/rejected": -2.7437028884887695, - "eval_logps/chosen": -377.7486267089844, - "eval_logps/rejected": -345.6202392578125, - "eval_loss": 0.6868348717689514, - "eval_rewards/accuracies": 0.7699999809265137, - "eval_rewards/chosen": -2.4882779121398926, - "eval_rewards/margins": 3.661820411682129, - "eval_rewards/rejected": -6.1500983238220215, - "eval_runtime": 499.7574, - "eval_samples_per_second": 4.002, - "eval_steps_per_second": 0.5, - "step": 2700 + "eval_logits/chosen": -2.7451467514038086, + "eval_logits/rejected": -2.805893898010254, + "eval_logps/chosen": -268.75567626953125, + "eval_logps/rejected": -358.093017578125, + "eval_loss": 0.60563725233078, + "eval_rewards/accuracies": 0.8119999766349792, + "eval_rewards/chosen": -2.1075518131256104, + "eval_rewards/margins": 6.088010311126709, + "eval_rewards/rejected": -8.195561408996582, + "eval_runtime": 278.6585, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.449, + "step": 5400 + }, + { + "epoch": 2.79, + "learning_rate": 3.8343851596863644e-08, + "logits/chosen": -2.7843363285064697, + "logits/rejected": -2.8077871799468994, + "logps/chosen": -202.0702362060547, + "logps/rejected": -344.1629333496094, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09458614885807037, + "rewards/margins": 11.792885780334473, + "rewards/rejected": -11.698301315307617, + "step": 5410 }, { "epoch": 2.8, - "learning_rate": 3.71220818982013e-08, - "logits/chosen": -2.811039924621582, - "logits/rejected": -2.7763166427612305, - "logps/chosen": -326.4056701660156, - "logps/rejected": -355.9825744628906, - "loss": 0.0112, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.38241684436798096, - "rewards/margins": 8.648158073425293, - "rewards/rejected": -9.0305757522583, - "step": 2710 + "learning_rate": 3.738764582138076e-08, + "logits/chosen": -2.7376439571380615, + "logits/rejected": -2.723874568939209, + "logps/chosen": -299.224609375, + "logps/rejected": -439.83123779296875, + "loss": 0.0434, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3747190833091736, + "rewards/margins": 11.698442459106445, + "rewards/rejected": -12.073161125183105, + "step": 5420 + }, + { + "epoch": 2.8, + "learning_rate": 3.6431440045897875e-08, + "logits/chosen": -2.7386252880096436, + "logits/rejected": -2.763640880584717, + "logps/chosen": -245.27920532226562, + "logps/rejected": -422.24578857421875, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29534271359443665, + "rewards/margins": 11.861417770385742, + "rewards/rejected": -11.566075325012207, + "step": 5430 }, { "epoch": 2.81, - "learning_rate": 3.520857252200535e-08, - "logits/chosen": -2.7573506832122803, - "logits/rejected": -2.760462522506714, - "logps/chosen": -360.5853271484375, - "logps/rejected": -408.0951843261719, - "loss": 0.0193, + "learning_rate": 3.547523427041499e-08, + "logits/chosen": -2.8376080989837646, + "logits/rejected": -2.8734335899353027, + "logps/chosen": -261.5392761230469, + "logps/rejected": -430.12628173828125, + "loss": 0.009, "rewards/accuracies": 1.0, - "rewards/chosen": 0.13503775000572205, - "rewards/margins": 9.324322700500488, - "rewards/rejected": -9.18928337097168, - "step": 2720 + "rewards/chosen": 0.30467402935028076, + "rewards/margins": 11.386775970458984, + "rewards/rejected": -11.08210277557373, + "step": 5440 + }, + { + "epoch": 2.81, + "learning_rate": 3.4519028494932106e-08, + "logits/chosen": -2.691027879714966, + "logits/rejected": -2.7183306217193604, + "logps/chosen": -277.2605895996094, + "logps/rejected": -429.47998046875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7162610292434692, + "rewards/margins": 12.104765892028809, + "rewards/rejected": -11.388504028320312, + "step": 5450 }, { "epoch": 2.82, - "learning_rate": 3.3295063145809414e-08, - "logits/chosen": -2.8028712272644043, - "logits/rejected": -2.8158020973205566, - "logps/chosen": -311.5306396484375, - "logps/rejected": -366.0463562011719, - "loss": 0.0156, + "learning_rate": 3.356282271944923e-08, + "logits/chosen": -2.787083387374878, + "logits/rejected": -2.8620715141296387, + "logps/chosen": -223.35128784179688, + "logps/rejected": -418.36273193359375, + "loss": 0.0158, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.29903894662857056, - "rewards/margins": 8.486640930175781, - "rewards/rejected": -8.78568172454834, - "step": 2730 + "rewards/chosen": -0.23595857620239258, + "rewards/margins": 11.202669143676758, + "rewards/rejected": -11.438629150390625, + "step": 5460 + }, + { + "epoch": 2.82, + "learning_rate": 3.260661694396634e-08, + "logits/chosen": -2.8234875202178955, + "logits/rejected": -2.874922275543213, + "logps/chosen": -286.9738464355469, + "logps/rejected": -367.65643310546875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04693390056490898, + "rewards/margins": 11.051458358764648, + "rewards/rejected": -11.004526138305664, + "step": 5470 }, { "epoch": 2.83, - "learning_rate": 3.138155376961347e-08, - "logits/chosen": -2.806452512741089, - "logits/rejected": -2.800107002258301, - "logps/chosen": -352.19903564453125, - "logps/rejected": -366.1153869628906, - "loss": 0.0088, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.05723688751459122, - "rewards/margins": 8.847169876098633, - "rewards/rejected": -8.789933204650879, - "step": 2740 + "learning_rate": 3.165041116848346e-08, + "logits/chosen": -2.787111282348633, + "logits/rejected": -2.856330394744873, + "logps/chosen": -239.1962890625, + "logps/rejected": -455.84100341796875, + "loss": 0.0339, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.376078724861145, + "rewards/margins": 12.097007751464844, + "rewards/rejected": -11.720929145812988, + "step": 5480 + }, + { + "epoch": 2.83, + "learning_rate": 3.0694205393000576e-08, + "logits/chosen": -2.758702516555786, + "logits/rejected": -2.7954318523406982, + "logps/chosen": -168.82022094726562, + "logps/rejected": -369.0736083984375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.675735592842102, + "rewards/margins": 11.410593032836914, + "rewards/rejected": -10.734857559204102, + "step": 5490 }, { "epoch": 2.84, - "learning_rate": 2.9468044393417525e-08, - "logits/chosen": -2.8288140296936035, - "logits/rejected": -2.819532871246338, - "logps/chosen": -319.1861572265625, - "logps/rejected": -365.1618347167969, - "loss": 0.0146, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.272183895111084, - "rewards/margins": 8.840131759643555, - "rewards/rejected": -8.567949295043945, - "step": 2750 + "learning_rate": 2.9737999617517688e-08, + "logits/chosen": -2.759615898132324, + "logits/rejected": -2.7838168144226074, + "logps/chosen": -289.23992919921875, + "logps/rejected": -394.021240234375, + "loss": 0.0115, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.07635448127985, + "rewards/margins": 10.999418258666992, + "rewards/rejected": -10.923063278198242, + "step": 5500 + }, + { + "epoch": 2.84, + "eval_logits/chosen": -2.7522401809692383, + "eval_logits/rejected": -2.813889265060425, + "eval_logps/chosen": -267.77825927734375, + "eval_logps/rejected": -356.04461669921875, + "eval_loss": 0.5964898467063904, + "eval_rewards/accuracies": 0.8159999847412109, + "eval_rewards/chosen": -2.0098071098327637, + "eval_rewards/margins": 5.9809160232543945, + "eval_rewards/rejected": -7.990723609924316, + "eval_runtime": 278.4881, + "eval_samples_per_second": 7.182, + "eval_steps_per_second": 0.449, + "step": 5500 + }, + { + "epoch": 2.84, + "learning_rate": 2.8781793842034804e-08, + "logits/chosen": -2.708756685256958, + "logits/rejected": -2.6932451725006104, + "logps/chosen": -203.24278259277344, + "logps/rejected": -373.3500061035156, + "loss": 0.0186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17195767164230347, + "rewards/margins": 10.566624641418457, + "rewards/rejected": -10.738582611083984, + "step": 5510 }, { "epoch": 2.85, - "learning_rate": 2.755453501722158e-08, - "logits/chosen": -2.7816238403320312, - "logits/rejected": -2.7954201698303223, - "logps/chosen": -333.7603759765625, - "logps/rejected": -378.86236572265625, - "loss": 0.0179, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.39214563369750977, - "rewards/margins": 8.563885688781738, - "rewards/rejected": -8.956029891967773, - "step": 2760 + "learning_rate": 2.782558806655192e-08, + "logits/chosen": -2.726964235305786, + "logits/rejected": -2.790684223175049, + "logps/chosen": -295.1116027832031, + "logps/rejected": -394.7130432128906, + "loss": 0.0149, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.37252020835876465, + "rewards/margins": 11.06298542022705, + "rewards/rejected": -10.690465927124023, + "step": 5520 + }, + { + "epoch": 2.85, + "learning_rate": 2.6869382291069035e-08, + "logits/chosen": -2.790950298309326, + "logits/rejected": -2.818000555038452, + "logps/chosen": -251.74435424804688, + "logps/rejected": -388.2196960449219, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0213242769241333, + "rewards/margins": 11.515314102172852, + "rewards/rejected": -10.493988990783691, + "step": 5530 }, { "epoch": 2.86, - "learning_rate": 2.564102564102564e-08, - "logits/chosen": -2.768709659576416, - "logits/rejected": -2.787172794342041, - "logps/chosen": -321.18896484375, - "logps/rejected": -379.5654296875, - "loss": 0.0082, + "learning_rate": 2.591317651558615e-08, + "logits/chosen": -2.7424418926239014, + "logits/rejected": -2.795841693878174, + "logps/chosen": -264.7119445800781, + "logps/rejected": -386.6141052246094, + "loss": 0.0052, "rewards/accuracies": 1.0, - "rewards/chosen": -0.08633549511432648, - "rewards/margins": 8.47877311706543, - "rewards/rejected": -8.565108299255371, - "step": 2770 + "rewards/chosen": -0.02202761173248291, + "rewards/margins": 10.968466758728027, + "rewards/rejected": -10.990495681762695, + "step": 5540 }, { "epoch": 2.87, - "learning_rate": 2.3727516264829695e-08, - "logits/chosen": -2.8109827041625977, - "logits/rejected": -2.797381639480591, - "logps/chosen": -363.3731689453125, - "logps/rejected": -408.6158142089844, - "loss": 0.014, + "learning_rate": 2.4956970740103267e-08, + "logits/chosen": -2.7561378479003906, + "logits/rejected": -2.7803356647491455, + "logps/chosen": -244.2420196533203, + "logps/rejected": -442.84710693359375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20751342177391052, + "rewards/margins": 11.346160888671875, + "rewards/rejected": -11.553674697875977, + "step": 5550 + }, + { + "epoch": 2.87, + "learning_rate": 2.4000764964620386e-08, + "logits/chosen": -2.789926528930664, + "logits/rejected": -2.8296680450439453, + "logps/chosen": -339.80731201171875, + "logps/rejected": -456.09735107421875, + "loss": 0.0117, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.22468623518943787, - "rewards/margins": 9.187434196472168, - "rewards/rejected": -9.412120819091797, - "step": 2780 + "rewards/chosen": -0.2991110384464264, + "rewards/margins": 11.766742706298828, + "rewards/rejected": -12.065852165222168, + "step": 5560 }, { "epoch": 2.88, - "learning_rate": 2.1814006888633754e-08, - "logits/chosen": -2.75907564163208, - "logits/rejected": -2.7653088569641113, - "logps/chosen": -343.4961242675781, - "logps/rejected": -381.5200500488281, - "loss": 0.0168, + "learning_rate": 2.30445591891375e-08, + "logits/chosen": -2.821697473526001, + "logits/rejected": -2.8349757194519043, + "logps/chosen": -321.14044189453125, + "logps/rejected": -400.93121337890625, + "loss": 0.0102, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4573872983455658, - "rewards/margins": 9.753976821899414, - "rewards/rejected": -9.296588897705078, - "step": 2790 + "rewards/chosen": 0.1228589192032814, + "rewards/margins": 11.465642929077148, + "rewards/rejected": -11.342782974243164, + "step": 5570 + }, + { + "epoch": 2.88, + "learning_rate": 2.2088353413654617e-08, + "logits/chosen": -2.675318479537964, + "logits/rejected": -2.7310335636138916, + "logps/chosen": -190.40487670898438, + "logps/rejected": -328.63714599609375, + "loss": 0.0212, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.008567571640014648, + "rewards/margins": 11.152410507202148, + "rewards/rejected": -11.160978317260742, + "step": 5580 }, { "epoch": 2.89, - "learning_rate": 1.990049751243781e-08, - "logits/chosen": -2.74211049079895, - "logits/rejected": -2.787912368774414, - "logps/chosen": -346.13165283203125, - "logps/rejected": -360.9788513183594, - "loss": 0.0149, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.0645522028207779, - "rewards/margins": 8.483091354370117, - "rewards/rejected": -8.418540000915527, - "step": 2800 + "learning_rate": 2.1132147638171733e-08, + "logits/chosen": -2.7256431579589844, + "logits/rejected": -2.7801289558410645, + "logps/chosen": -292.7064514160156, + "logps/rejected": -366.49359130859375, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7942097187042236, + "rewards/margins": 11.075929641723633, + "rewards/rejected": -10.281720161437988, + "step": 5590 }, { "epoch": 2.89, - "eval_logits/chosen": -2.7622878551483154, - "eval_logits/rejected": -2.7409138679504395, - "eval_logps/chosen": -376.6426086425781, - "eval_logps/rejected": -344.5401306152344, - "eval_loss": 0.6851915121078491, - "eval_rewards/accuracies": 0.7699999809265137, - "eval_rewards/chosen": -2.3776779174804688, - "eval_rewards/margins": 3.66440486907959, - "eval_rewards/rejected": -6.0420823097229, - "eval_runtime": 499.4818, - "eval_samples_per_second": 4.004, - "eval_steps_per_second": 0.501, - "step": 2800 + "learning_rate": 2.0175941862688848e-08, + "logits/chosen": -2.7792868614196777, + "logits/rejected": -2.8381927013397217, + "logps/chosen": -220.96133422851562, + "logps/rejected": -358.05316162109375, + "loss": 0.0321, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2555966079235077, + "rewards/margins": 11.58265209197998, + "rewards/rejected": -11.327055931091309, + "step": 5600 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.7509632110595703, + "eval_logits/rejected": -2.8135623931884766, + "eval_logps/chosen": -268.1117858886719, + "eval_logps/rejected": -357.17138671875, + "eval_loss": 0.6050785183906555, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -2.043166399002075, + "eval_rewards/margins": 6.06022834777832, + "eval_rewards/rejected": -8.103395462036133, + "eval_runtime": 278.5351, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.449, + "step": 5600 }, { "epoch": 2.9, - "learning_rate": 1.7986988136241865e-08, - "logits/chosen": -2.7683703899383545, - "logits/rejected": -2.7223987579345703, - "logps/chosen": -355.0417175292969, - "logps/rejected": -366.55462646484375, - "loss": 0.0167, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.14950156211853027, - "rewards/margins": 8.354609489440918, - "rewards/rejected": -8.504112243652344, - "step": 2810 + "learning_rate": 1.9219736087205964e-08, + "logits/chosen": -2.7538223266601562, + "logits/rejected": -2.788954257965088, + "logps/chosen": -240.25765991210938, + "logps/rejected": -371.2794494628906, + "loss": 0.0157, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.055735863745212555, + "rewards/margins": 10.902502059936523, + "rewards/rejected": -10.958239555358887, + "step": 5610 + }, + { + "epoch": 2.9, + "learning_rate": 1.826353031172308e-08, + "logits/chosen": -2.747121810913086, + "logits/rejected": -2.759830951690674, + "logps/chosen": -335.62060546875, + "logps/rejected": -478.05084228515625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19696936011314392, + "rewards/margins": 11.078533172607422, + "rewards/rejected": -10.881563186645508, + "step": 5620 }, { "epoch": 2.91, - "learning_rate": 1.6073478760045924e-08, - "logits/chosen": -2.766155481338501, - "logits/rejected": -2.7364845275878906, - "logps/chosen": -369.193115234375, - "logps/rejected": -349.85107421875, - "loss": 0.0144, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08105222135782242, - "rewards/margins": 8.850485801696777, - "rewards/rejected": -8.76943302154541, - "step": 2820 + "learning_rate": 1.73073245362402e-08, + "logits/chosen": -2.7759406566619873, + "logits/rejected": -2.8184428215026855, + "logps/chosen": -253.0712890625, + "logps/rejected": -366.9938049316406, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.258527547121048, + "rewards/margins": 11.674029350280762, + "rewards/rejected": -11.932558059692383, + "step": 5630 + }, + { + "epoch": 2.91, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -2.789585590362549, + "logits/rejected": -2.819732666015625, + "logps/chosen": -250.55001831054688, + "logps/rejected": -388.7904052734375, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.04169013351202011, + "rewards/margins": 11.472790718078613, + "rewards/rejected": -11.431100845336914, + "step": 5640 }, { "epoch": 2.92, - "learning_rate": 1.4159969383849981e-08, - "logits/chosen": -2.757335662841797, - "logits/rejected": -2.735102415084839, - "logps/chosen": -344.85015869140625, - "logps/rejected": -362.55096435546875, - "loss": 0.0128, + "learning_rate": 1.539491298527443e-08, + "logits/chosen": -2.7909374237060547, + "logits/rejected": -2.8411381244659424, + "logps/chosen": -233.0658721923828, + "logps/rejected": -389.83880615234375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0178313497453928, + "rewards/margins": 11.449518203735352, + "rewards/rejected": -11.467348098754883, + "step": 5650 + }, + { + "epoch": 2.92, + "learning_rate": 1.4438707209791546e-08, + "logits/chosen": -2.77156925201416, + "logits/rejected": -2.8066821098327637, + "logps/chosen": -251.38528442382812, + "logps/rejected": -344.01568603515625, + "loss": 0.0159, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.3330300748348236, - "rewards/margins": 8.652267456054688, - "rewards/rejected": -8.985297203063965, - "step": 2830 + "rewards/chosen": 0.047732848674058914, + "rewards/margins": 10.928056716918945, + "rewards/rejected": -10.88032341003418, + "step": 5660 }, { "epoch": 2.93, - "learning_rate": 1.2246460007654037e-08, - "logits/chosen": -2.774275302886963, - "logits/rejected": -2.7630369663238525, - "logps/chosen": -316.2999572753906, - "logps/rejected": -335.5289611816406, - "loss": 0.0233, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.3553180992603302, - "rewards/margins": 8.010857582092285, - "rewards/rejected": -8.366175651550293, - "step": 2840 + "learning_rate": 1.3482501434308661e-08, + "logits/chosen": -2.7734055519104004, + "logits/rejected": -2.776326894760132, + "logps/chosen": -257.57904052734375, + "logps/rejected": -362.7672424316406, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4579504132270813, + "rewards/margins": 10.892066955566406, + "rewards/rejected": -10.43411636352539, + "step": 5670 + }, + { + "epoch": 2.93, + "learning_rate": 1.2526295658825777e-08, + "logits/chosen": -2.8509745597839355, + "logits/rejected": -2.8811259269714355, + "logps/chosen": -281.4604187011719, + "logps/rejected": -409.92816162109375, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2449922263622284, + "rewards/margins": 11.449122428894043, + "rewards/rejected": -11.204131126403809, + "step": 5680 }, { "epoch": 2.94, - "learning_rate": 1.0332950631458094e-08, - "logits/chosen": -2.743933916091919, - "logits/rejected": -2.7586922645568848, - "logps/chosen": -362.8529052734375, - "logps/rejected": -367.4391784667969, - "loss": 0.0147, + "learning_rate": 1.1570089883342895e-08, + "logits/chosen": -2.730487108230591, + "logits/rejected": -2.734302043914795, + "logps/chosen": -316.2430114746094, + "logps/rejected": -447.566650390625, + "loss": 0.0113, "rewards/accuracies": 1.0, - "rewards/chosen": 0.43926841020584106, - "rewards/margins": 9.078571319580078, - "rewards/rejected": -8.639303207397461, - "step": 2850 + "rewards/chosen": 0.04693268612027168, + "rewards/margins": 12.631341934204102, + "rewards/rejected": -12.5844087600708, + "step": 5690 + }, + { + "epoch": 2.94, + "learning_rate": 1.061388410786001e-08, + "logits/chosen": -2.731062650680542, + "logits/rejected": -2.831976890563965, + "logps/chosen": -206.12698364257812, + "logps/rejected": -342.6966552734375, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0030780285596847534, + "rewards/margins": 10.542811393737793, + "rewards/rejected": -10.539732933044434, + "step": 5700 + }, + { + "epoch": 2.94, + "eval_logits/chosen": -2.747497320175171, + "eval_logits/rejected": -2.810014486312866, + "eval_logps/chosen": -267.9060974121094, + "eval_logps/rejected": -357.0297546386719, + "eval_loss": 0.6040644645690918, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -2.0225942134857178, + "eval_rewards/margins": 6.066638469696045, + "eval_rewards/rejected": -8.0892333984375, + "eval_runtime": 278.4442, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.449, + "step": 5700 }, { "epoch": 2.95, - "learning_rate": 8.419441255262151e-09, - "logits/chosen": -2.7890658378601074, - "logits/rejected": -2.7745556831359863, - "logps/chosen": -299.62799072265625, - "logps/rejected": -343.54083251953125, - "loss": 0.0124, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.6124383211135864, - "rewards/margins": 8.287772178649902, - "rewards/rejected": -8.900211334228516, - "step": 2860 + "learning_rate": 9.657678332377126e-09, + "logits/chosen": -2.7119622230529785, + "logits/rejected": -2.760438919067383, + "logps/chosen": -233.8179931640625, + "logps/rejected": -365.3365783691406, + "loss": 0.0147, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6185123920440674, + "rewards/margins": 11.846052169799805, + "rewards/rejected": -11.227540016174316, + "step": 5710 + }, + { + "epoch": 2.95, + "learning_rate": 8.701472556894243e-09, + "logits/chosen": -2.7717738151550293, + "logits/rejected": -2.7559776306152344, + "logps/chosen": -206.87277221679688, + "logps/rejected": -384.5017395019531, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37782225012779236, + "rewards/margins": 11.53270435333252, + "rewards/rejected": -11.910527229309082, + "step": 5720 + }, + { + "epoch": 2.96, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -2.793654203414917, + "logits/rejected": -2.835156202316284, + "logps/chosen": -264.3742370605469, + "logps/rejected": -431.3934631347656, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7172238230705261, + "rewards/margins": 11.648252487182617, + "rewards/rejected": -10.931028366088867, + "step": 5730 }, { "epoch": 2.96, - "learning_rate": 6.505931879066207e-09, - "logits/chosen": -2.8043646812438965, - "logits/rejected": -2.8070147037506104, - "logps/chosen": -316.2520446777344, - "logps/rejected": -351.5118103027344, - "loss": 0.0224, + "learning_rate": 6.7890610059284754e-09, + "logits/chosen": -2.7689952850341797, + "logits/rejected": -2.7692933082580566, + "logps/chosen": -179.81179809570312, + "logps/rejected": -330.4810485839844, + "loss": 0.024, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.41816624999046326, - "rewards/margins": 8.223325729370117, - "rewards/rejected": -8.64149284362793, - "step": 2870 + "rewards/chosen": 0.23751434683799744, + "rewards/margins": 10.173518180847168, + "rewards/rejected": -9.936005592346191, + "step": 5740 }, { "epoch": 2.97, - "learning_rate": 4.592422502870264e-09, - "logits/chosen": -2.7751667499542236, - "logits/rejected": -2.775235414505005, - "logps/chosen": -327.26068115234375, - "logps/rejected": -369.03839111328125, - "loss": 0.0158, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.019028399139642715, - "rewards/margins": 9.00733757019043, - "rewards/rejected": -9.026365280151367, - "step": 2880 + "learning_rate": 5.832855230445592e-09, + "logits/chosen": -2.700852870941162, + "logits/rejected": -2.771523952484131, + "logps/chosen": -291.72332763671875, + "logps/rejected": -378.6692810058594, + "loss": 0.0458, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.45632845163345337, + "rewards/margins": 11.953349113464355, + "rewards/rejected": -11.497020721435547, + "step": 5750 + }, + { + "epoch": 2.97, + "learning_rate": 4.8766494549627085e-09, + "logits/chosen": -2.7623281478881836, + "logits/rejected": -2.7841367721557617, + "logps/chosen": -276.88714599609375, + "logps/rejected": -383.640380859375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8869491815567017, + "rewards/margins": 12.092225074768066, + "rewards/rejected": -11.205277442932129, + "step": 5760 }, { "epoch": 2.98, - "learning_rate": 2.6789131266743202e-09, - "logits/chosen": -2.8183040618896484, - "logits/rejected": -2.7807929515838623, - "logps/chosen": -337.8726806640625, - "logps/rejected": -379.15032958984375, - "loss": 0.0222, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.21741366386413574, - "rewards/margins": 8.339401245117188, - "rewards/rejected": -8.556814193725586, - "step": 2890 + "learning_rate": 3.920443679479824e-09, + "logits/chosen": -2.8144805431365967, + "logits/rejected": -2.8412563800811768, + "logps/chosen": -268.53143310546875, + "logps/rejected": -369.7856750488281, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38506728410720825, + "rewards/margins": 10.940896034240723, + "rewards/rejected": -10.555828094482422, + "step": 5770 }, { - "epoch": 3.0, - "learning_rate": 7.654037504783773e-10, - "logits/chosen": -2.847442865371704, - "logits/rejected": -2.8513660430908203, - "logps/chosen": -318.14996337890625, - "logps/rejected": -386.6273498535156, - "loss": 0.0105, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.028394419699907303, - "rewards/margins": 8.268038749694824, - "rewards/rejected": -8.239645004272461, - "step": 2900 + "epoch": 2.98, + "learning_rate": 2.96423790399694e-09, + "logits/chosen": -2.7938127517700195, + "logits/rejected": -2.842219591140747, + "logps/chosen": -228.3662567138672, + "logps/rejected": -393.7285461425781, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6316030025482178, + "rewards/margins": 11.577722549438477, + "rewards/rejected": -10.94611930847168, + "step": 5780 + }, + { + "epoch": 2.99, + "learning_rate": 2.008032128514056e-09, + "logits/chosen": -2.8088643550872803, + "logits/rejected": -2.828951597213745, + "logps/chosen": -286.57672119140625, + "logps/rejected": -427.19659423828125, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6554132103919983, + "rewards/margins": 12.086005210876465, + "rewards/rejected": -11.430593490600586, + "step": 5790 + }, + { + "epoch": 2.99, + "learning_rate": 1.0518263530311723e-09, + "logits/chosen": -2.7882561683654785, + "logits/rejected": -2.8440985679626465, + "logps/chosen": -204.60406494140625, + "logps/rejected": -374.4756774902344, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47186797857284546, + "rewards/margins": 10.928264617919922, + "rewards/rejected": -10.456398010253906, + "step": 5800 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.7457115650177, + "eval_logits/rejected": -2.808168411254883, + "eval_logps/chosen": -267.2555847167969, + "eval_logps/rejected": -356.2176208496094, + "eval_loss": 0.6030594110488892, + "eval_rewards/accuracies": 0.8140000104904175, + "eval_rewards/chosen": -1.9575421810150146, + "eval_rewards/margins": 6.05047607421875, + "eval_rewards/rejected": -8.008018493652344, + "eval_runtime": 278.3945, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.449, + "step": 5800 }, { "epoch": 3.0, - "eval_logits/chosen": -2.7662971019744873, - "eval_logits/rejected": -2.7454662322998047, - "eval_logps/chosen": -376.1639099121094, - "eval_logps/rejected": -344.25634765625, - "eval_loss": 0.6832324266433716, - "eval_rewards/accuracies": 0.7639999985694885, - "eval_rewards/chosen": -2.3298091888427734, - "eval_rewards/margins": 3.683899402618408, - "eval_rewards/rejected": -6.013708591461182, - "eval_runtime": 499.2953, - "eval_samples_per_second": 4.006, - "eval_steps_per_second": 0.501, - "step": 2900 + "learning_rate": 9.562057754828839e-11, + "logits/chosen": -2.7588553428649902, + "logits/rejected": -2.836669445037842, + "logps/chosen": -241.68515014648438, + "logps/rejected": -372.9002380371094, + "loss": 0.0155, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.14204326272010803, + "rewards/margins": 10.061766624450684, + "rewards/rejected": -9.919723510742188, + "step": 5810 }, { "epoch": 3.0, - "step": 2904, + "step": 5811, "total_flos": 0.0, - "train_loss": 0.2301063642942298, - "train_runtime": 127972.4677, - "train_samples_per_second": 1.453, - "train_steps_per_second": 0.023 + "train_loss": 0.19806672207460788, + "train_runtime": 74526.9689, + "train_samples_per_second": 2.494, + "train_steps_per_second": 0.078 } ], "logging_steps": 10, - "max_steps": 2904, + "max_steps": 5811, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0,