{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981378026070763, "eval_steps": 500, "global_step": 134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 20.656896651231627, "learning_rate": 3.5714285714285716e-07, "loss": 3.6136, "step": 1, "trainloss/critic_chosen": 1.459133505821228, "trainloss/critic_rejected": 1.468864917755127, "trainloss/reward": 1.459133505821228, "trainrewards/accuracies": 0.5833333134651184, "trainrewards/chosen": 0.3359375, "trainrewards/margins": 0.0308837890625, "trainrewards/rejected": 0.3046875 }, { "epoch": 0.01, "grad_norm": 20.747827742934472, "learning_rate": 7.142857142857143e-07, "loss": 3.6381, "step": 2, "trainloss/critic_chosen": 1.4447739124298096, "trainloss/critic_rejected": 1.4999535083770752, "trainloss/reward": 1.4447739124298096, "trainrewards/accuracies": 0.5104166865348816, "trainrewards/chosen": 0.314453125, "trainrewards/margins": 0.01531982421875, "trainrewards/rejected": 0.298828125 }, { "epoch": 0.02, "grad_norm": 19.429850511676147, "learning_rate": 1.0714285714285714e-06, "loss": 3.6713, "step": 3, "trainloss/critic_chosen": 1.4738179445266724, "trainloss/critic_rejected": 1.505049228668213, "trainloss/reward": 1.4738179445266724, "trainrewards/accuracies": 0.5364583134651184, "trainrewards/chosen": 0.302734375, "trainrewards/margins": 0.015869140625, "trainrewards/rejected": 0.287109375 }, { "epoch": 0.03, "grad_norm": 19.353394477551088, "learning_rate": 1.4285714285714286e-06, "loss": 3.6414, "step": 4, "trainloss/critic_chosen": 1.4593632221221924, "trainloss/critic_rejected": 1.4944238662719727, "trainloss/reward": 1.4593632221221924, "trainrewards/accuracies": 0.5572916865348816, "trainrewards/chosen": 0.357421875, "trainrewards/margins": 0.0286865234375, "trainrewards/rejected": 0.328125 }, { "epoch": 0.04, "grad_norm": 19.314515274756275, "learning_rate": 1.7857142857142859e-06, "loss": 3.6014, "step": 5, "trainloss/critic_chosen": 1.4528993368148804, "trainloss/critic_rejected": 1.5066261291503906, "trainloss/reward": 1.4528993368148804, "trainrewards/accuracies": 0.7760417461395264, "trainrewards/chosen": 0.494140625, "trainrewards/margins": 0.130859375, "trainrewards/rejected": 0.36328125 }, { "epoch": 0.04, "grad_norm": 17.65488892011503, "learning_rate": 2.1428571428571427e-06, "loss": 3.5132, "step": 6, "trainloss/critic_chosen": 1.4163868427276611, "trainloss/critic_rejected": 1.4701489210128784, "trainloss/reward": 1.4163868427276611, "trainrewards/accuracies": 0.7864583730697632, "trainrewards/chosen": 0.54296875, "trainrewards/margins": 0.169921875, "trainrewards/rejected": 0.373046875 }, { "epoch": 0.05, "grad_norm": 14.922878013160528, "learning_rate": 2.5e-06, "loss": 3.4609, "step": 7, "trainloss/critic_chosen": 1.4025382995605469, "trainloss/critic_rejected": 1.4922353029251099, "trainloss/reward": 1.4025382995605469, "trainrewards/accuracies": 0.8281250596046448, "trainrewards/chosen": 0.9453125, "trainrewards/margins": 0.376953125, "trainrewards/rejected": 0.56640625 }, { "epoch": 0.06, "grad_norm": 14.232594419094823, "learning_rate": 2.8571428571428573e-06, "loss": 3.3368, "step": 8, "trainloss/critic_chosen": 1.3685592412948608, "trainloss/critic_rejected": 1.4283447265625, "trainloss/reward": 1.3685592412948608, "trainrewards/accuracies": 0.8645833730697632, "trainrewards/chosen": 1.0078125, "trainrewards/margins": 0.47265625, "trainrewards/rejected": 0.53515625 }, { "epoch": 0.07, "grad_norm": 12.480541507697524, "learning_rate": 3.2142857142857147e-06, "loss": 3.0686, "step": 9, "trainloss/critic_chosen": 1.3232171535491943, "trainloss/critic_rejected": 1.3606585264205933, "trainloss/reward": 1.3232171535491943, "trainrewards/accuracies": 0.9114583134651184, "trainrewards/chosen": 1.671875, "trainrewards/margins": 1.3359375, "trainrewards/rejected": 0.333984375 }, { "epoch": 0.07, "grad_norm": 8.25153148096521, "learning_rate": 3.5714285714285718e-06, "loss": 3.0194, "step": 10, "trainloss/critic_chosen": 1.289527177810669, "trainloss/critic_rejected": 1.3465213775634766, "trainloss/reward": 1.289527177810669, "trainrewards/accuracies": 0.9010416865348816, "trainrewards/chosen": 1.3125, "trainrewards/margins": 1.4375, "trainrewards/rejected": -0.1220703125 }, { "epoch": 0.08, "grad_norm": 15.573626174864643, "learning_rate": 3.928571428571429e-06, "loss": 2.9476, "step": 11, "trainloss/critic_chosen": 1.266632318496704, "trainloss/critic_rejected": 1.3359544277191162, "trainloss/reward": 1.266632318496704, "trainrewards/accuracies": 0.9010417461395264, "trainrewards/chosen": 0.208984375, "trainrewards/margins": 2.25, "trainrewards/rejected": -2.046875 }, { "epoch": 0.09, "grad_norm": 8.798738351150583, "learning_rate": 4.2857142857142855e-06, "loss": 2.9208, "step": 12, "trainloss/critic_chosen": 1.285549521446228, "trainloss/critic_rejected": 1.336460828781128, "trainloss/reward": 1.285549521446228, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 1.390625, "trainrewards/margins": 2.453125, "trainrewards/rejected": -1.0625 }, { "epoch": 0.1, "grad_norm": 10.920314695532706, "learning_rate": 4.642857142857144e-06, "loss": 2.9737, "step": 13, "trainloss/critic_chosen": 1.3031879663467407, "trainloss/critic_rejected": 1.3538345098495483, "trainloss/reward": 1.3031879663467407, "trainrewards/accuracies": 0.9583333134651184, "trainrewards/chosen": 1.9453125, "trainrewards/margins": 2.203125, "trainrewards/rejected": -0.255859375 }, { "epoch": 0.1, "grad_norm": 7.902411216600844, "learning_rate": 5e-06, "loss": 2.8568, "step": 14, "trainloss/critic_chosen": 1.2445811033248901, "trainloss/critic_rejected": 1.3195788860321045, "trainloss/reward": 1.2445811033248901, "trainrewards/accuracies": 0.90625, "trainrewards/chosen": 1.5078125, "trainrewards/margins": 2.09375, "trainrewards/rejected": -0.5859375 }, { "epoch": 0.11, "grad_norm": 11.497405609399582, "learning_rate": 4.999143312438893e-06, "loss": 2.8485, "step": 15, "trainloss/critic_chosen": 1.2575112581253052, "trainloss/critic_rejected": 1.303661823272705, "trainloss/reward": 1.2575112581253052, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 0.55078125, "trainrewards/margins": 1.84375, "trainrewards/rejected": -1.2890625 }, { "epoch": 0.12, "grad_norm": 8.767557307270321, "learning_rate": 4.9965738368864345e-06, "loss": 2.8737, "step": 16, "trainloss/critic_chosen": 1.2387288808822632, "trainloss/critic_rejected": 1.3038721084594727, "trainloss/reward": 1.2387288808822632, "trainrewards/accuracies": 0.9010416865348816, "trainrewards/chosen": 1.578125, "trainrewards/margins": 2.59375, "trainrewards/rejected": -1.015625 }, { "epoch": 0.13, "grad_norm": 8.749611939075479, "learning_rate": 4.992293334332821e-06, "loss": 2.8681, "step": 17, "trainloss/critic_chosen": 1.2373957633972168, "trainloss/critic_rejected": 1.302640676498413, "trainloss/reward": 1.2373957633972168, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.5859375, "trainrewards/margins": 2.203125, "trainrewards/rejected": -0.61328125 }, { "epoch": 0.13, "grad_norm": 8.484573172931489, "learning_rate": 4.986304738420684e-06, "loss": 2.8305, "step": 18, "trainloss/critic_chosen": 1.23964262008667, "trainloss/critic_rejected": 1.297165870666504, "trainloss/reward": 1.23964262008667, "trainrewards/accuracies": 0.9166666865348816, "trainrewards/chosen": 0.60546875, "trainrewards/margins": 1.765625, "trainrewards/rejected": -1.15625 }, { "epoch": 0.14, "grad_norm": 5.43676917020596, "learning_rate": 4.978612153434527e-06, "loss": 2.7193, "step": 19, "trainloss/critic_chosen": 1.2180960178375244, "trainloss/critic_rejected": 1.2327347993850708, "trainloss/reward": 1.2180960178375244, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.5546875, "trainrewards/margins": 2.296875, "trainrewards/rejected": -0.7421875 }, { "epoch": 0.15, "grad_norm": 5.651490595569425, "learning_rate": 4.9692208514878445e-06, "loss": 2.8528, "step": 20, "trainloss/critic_chosen": 1.2103852033615112, "trainloss/critic_rejected": 1.3035379648208618, "trainloss/reward": 1.2103852033615112, "trainrewards/accuracies": 0.9062500596046448, "trainrewards/chosen": 1.75, "trainrewards/margins": 2.6875, "trainrewards/rejected": -0.94921875 }, { "epoch": 0.16, "grad_norm": 5.1975082682744524, "learning_rate": 4.958137268909887e-06, "loss": 2.7287, "step": 21, "trainloss/critic_chosen": 1.1857198476791382, "trainloss/critic_rejected": 1.2193048000335693, "trainloss/reward": 1.1857198476791382, "trainrewards/accuracies": 0.9114583730697632, "trainrewards/chosen": 1.4296875, "trainrewards/margins": 2.21875, "trainrewards/rejected": -0.7890625 }, { "epoch": 0.16, "grad_norm": 5.466958288822169, "learning_rate": 4.9453690018345144e-06, "loss": 2.7514, "step": 22, "trainloss/critic_chosen": 1.1868751049041748, "trainloss/critic_rejected": 1.256333827972412, "trainloss/reward": 1.1868751049041748, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 0.953125, "trainrewards/margins": 1.71875, "trainrewards/rejected": -0.76171875 }, { "epoch": 0.17, "grad_norm": 4.795005616591591, "learning_rate": 4.930924800994192e-06, "loss": 2.7025, "step": 23, "trainloss/critic_chosen": 1.1841645240783691, "trainloss/critic_rejected": 1.2626478672027588, "trainloss/reward": 1.1841645240783691, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.0625, "trainrewards/margins": 2.09375, "trainrewards/rejected": -1.0390625 }, { "epoch": 0.18, "grad_norm": 7.216599935232891, "learning_rate": 4.914814565722671e-06, "loss": 2.7024, "step": 24, "trainloss/critic_chosen": 1.1600149869918823, "trainloss/critic_rejected": 1.216670036315918, "trainloss/reward": 1.1600149869918823, "trainrewards/accuracies": 0.90625, "trainrewards/chosen": 1.953125, "trainrewards/margins": 2.53125, "trainrewards/rejected": -0.578125 }, { "epoch": 0.19, "grad_norm": 5.574536341669933, "learning_rate": 4.897049337170483e-06, "loss": 2.6825, "step": 25, "trainloss/critic_chosen": 1.17496919631958, "trainloss/critic_rejected": 1.2430278062820435, "trainloss/reward": 1.17496919631958, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.84375, "trainrewards/margins": 2.71875, "trainrewards/rejected": -0.87109375 }, { "epoch": 0.19, "grad_norm": 8.130831144336229, "learning_rate": 4.8776412907378845e-06, "loss": 2.7403, "step": 26, "trainloss/critic_chosen": 1.1843974590301514, "trainloss/critic_rejected": 1.2316250801086426, "trainloss/reward": 1.1843974590301514, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 0.322265625, "trainrewards/margins": 2.09375, "trainrewards/rejected": -1.78125 }, { "epoch": 0.2, "grad_norm": 4.106462749941039, "learning_rate": 4.856603727730446e-06, "loss": 2.6318, "step": 27, "trainloss/critic_chosen": 1.1325989961624146, "trainloss/critic_rejected": 1.2125966548919678, "trainloss/reward": 1.1325989961624146, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 0.98828125, "trainrewards/margins": 1.859375, "trainrewards/rejected": -0.875 }, { "epoch": 0.21, "grad_norm": 7.501840024960186, "learning_rate": 4.833951066243004e-06, "loss": 2.7439, "step": 28, "trainloss/critic_chosen": 1.156808853149414, "trainloss/critic_rejected": 1.218095064163208, "trainloss/reward": 1.156808853149414, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 2.03125, "trainrewards/margins": 2.0, "trainrewards/rejected": 0.021240234375 }, { "epoch": 0.22, "grad_norm": 10.542645887143404, "learning_rate": 4.809698831278217e-06, "loss": 2.6949, "step": 29, "trainloss/critic_chosen": 1.146854043006897, "trainloss/critic_rejected": 1.2151950597763062, "trainloss/reward": 1.146854043006897, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 2.546875, "trainrewards/margins": 2.375, "trainrewards/rejected": 0.169921875 }, { "epoch": 0.22, "grad_norm": 5.112451478263716, "learning_rate": 4.783863644106502e-06, "loss": 2.6784, "step": 30, "trainloss/critic_chosen": 1.1554011106491089, "trainloss/critic_rejected": 1.2330732345581055, "trainloss/reward": 1.1554011106491089, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.6875, "trainrewards/margins": 2.390625, "trainrewards/rejected": -0.703125 }, { "epoch": 0.23, "grad_norm": 5.406168864359725, "learning_rate": 4.7564632108746524e-06, "loss": 2.716, "step": 31, "trainloss/critic_chosen": 1.162062168121338, "trainloss/critic_rejected": 1.2383043766021729, "trainloss/reward": 1.162062168121338, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 0.609375, "trainrewards/margins": 1.9765625, "trainrewards/rejected": -1.3671875 }, { "epoch": 0.24, "grad_norm": 5.067065444400827, "learning_rate": 4.72751631047092e-06, "loss": 2.6516, "step": 32, "trainloss/critic_chosen": 1.1599905490875244, "trainloss/critic_rejected": 1.2166763544082642, "trainloss/reward": 1.1599905490875244, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 0.7265625, "trainrewards/margins": 2.21875, "trainrewards/rejected": -1.4921875 }, { "epoch": 0.25, "grad_norm": 6.759001147106114, "learning_rate": 4.697042781654913e-06, "loss": 2.6586, "step": 33, "trainloss/critic_chosen": 1.1513490676879883, "trainloss/critic_rejected": 1.1805285215377808, "trainloss/reward": 1.1513490676879883, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.8203125, "trainrewards/margins": 2.234375, "trainrewards/rejected": -0.408203125 }, { "epoch": 0.25, "grad_norm": 7.671545596305826, "learning_rate": 4.665063509461098e-06, "loss": 2.6397, "step": 34, "trainloss/critic_chosen": 1.1355525255203247, "trainloss/critic_rejected": 1.1824406385421753, "trainloss/reward": 1.1355525255203247, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 2.15625, "trainrewards/margins": 2.34375, "trainrewards/rejected": -0.1923828125 }, { "epoch": 0.26, "grad_norm": 4.120967770831028, "learning_rate": 4.631600410885231e-06, "loss": 2.6941, "step": 35, "trainloss/critic_chosen": 1.1876243352890015, "trainloss/critic_rejected": 1.2462928295135498, "trainloss/reward": 1.1876243352890015, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.6640625, "trainrewards/margins": 2.453125, "trainrewards/rejected": -0.78125 }, { "epoch": 0.27, "grad_norm": 4.873851901547121, "learning_rate": 4.596676419863561e-06, "loss": 2.5644, "step": 36, "trainloss/critic_chosen": 1.1080451011657715, "trainloss/critic_rejected": 1.1967337131500244, "trainloss/reward": 1.1080451011657715, "trainrewards/accuracies": 0.96875, "trainrewards/chosen": 0.80078125, "trainrewards/margins": 2.125, "trainrewards/rejected": -1.328125 }, { "epoch": 0.28, "grad_norm": 3.9620697117121004, "learning_rate": 4.560315471555039e-06, "loss": 2.5956, "step": 37, "trainloss/critic_chosen": 1.1373480558395386, "trainloss/critic_rejected": 1.2142869234085083, "trainloss/reward": 1.1373480558395386, "trainrewards/accuracies": 0.9375000596046448, "trainrewards/chosen": 1.0234375, "trainrewards/margins": 2.40625, "trainrewards/rejected": -1.390625 }, { "epoch": 0.28, "grad_norm": 5.768390511994523, "learning_rate": 4.522542485937369e-06, "loss": 2.6888, "step": 38, "trainloss/critic_chosen": 1.1469529867172241, "trainloss/critic_rejected": 1.2045722007751465, "trainloss/reward": 1.1469529867172241, "trainrewards/accuracies": 0.9114583730697632, "trainrewards/chosen": 1.8046875, "trainrewards/margins": 2.484375, "trainrewards/rejected": -0.6875 }, { "epoch": 0.29, "grad_norm": 4.874609149891752, "learning_rate": 4.4833833507280884e-06, "loss": 2.5543, "step": 39, "trainloss/critic_chosen": 1.1098886728286743, "trainloss/critic_rejected": 1.1714903116226196, "trainloss/reward": 1.1098886728286743, "trainrewards/accuracies": 0.958333432674408, "trainrewards/chosen": 1.8984375, "trainrewards/margins": 2.625, "trainrewards/rejected": -0.7265625 }, { "epoch": 0.3, "grad_norm": 3.396284173013532, "learning_rate": 4.442864903642428e-06, "loss": 2.6564, "step": 40, "trainloss/critic_chosen": 1.1380069255828857, "trainloss/critic_rejected": 1.2159972190856934, "trainloss/reward": 1.1380069255828857, "trainrewards/accuracies": 0.9427083134651184, "trainrewards/chosen": 0.9296875, "trainrewards/margins": 1.9375, "trainrewards/rejected": -1.0078125 }, { "epoch": 0.31, "grad_norm": 3.8092153086439087, "learning_rate": 4.401014914000078e-06, "loss": 2.5515, "step": 41, "trainloss/critic_chosen": 1.123491883277893, "trainloss/critic_rejected": 1.1983730792999268, "trainloss/reward": 1.123491883277893, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.015625, "trainrewards/margins": 2.125, "trainrewards/rejected": -1.1015625 }, { "epoch": 0.31, "grad_norm": 3.292224209377405, "learning_rate": 4.357862063693486e-06, "loss": 2.6296, "step": 42, "trainloss/critic_chosen": 1.1296896934509277, "trainloss/critic_rejected": 1.193892002105713, "trainloss/reward": 1.1296896934509277, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.3671875, "trainrewards/margins": 2.234375, "trainrewards/rejected": -0.859375 }, { "epoch": 0.32, "grad_norm": 4.97304962284229, "learning_rate": 4.313435927530719e-06, "loss": 2.5984, "step": 43, "trainloss/critic_chosen": 1.106866478919983, "trainloss/critic_rejected": 1.1839522123336792, "trainloss/reward": 1.106866478919983, "trainrewards/accuracies": 0.9166666865348816, "trainrewards/chosen": 1.859375, "trainrewards/margins": 2.515625, "trainrewards/rejected": -0.6640625 }, { "epoch": 0.33, "grad_norm": 3.1437931542625615, "learning_rate": 4.267766952966369e-06, "loss": 2.6053, "step": 44, "trainloss/critic_chosen": 1.141026496887207, "trainloss/critic_rejected": 1.1881659030914307, "trainloss/reward": 1.141026496887207, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.484375, "trainrewards/margins": 2.5, "trainrewards/rejected": -1.015625 }, { "epoch": 0.34, "grad_norm": 3.0260425195721727, "learning_rate": 4.220886439234385e-06, "loss": 2.6162, "step": 45, "trainloss/critic_chosen": 1.1437909603118896, "trainloss/critic_rejected": 1.1694350242614746, "trainloss/reward": 1.1437909603118896, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.3359375, "trainrewards/margins": 2.265625, "trainrewards/rejected": -0.93359375 }, { "epoch": 0.34, "grad_norm": 3.9421991947992803, "learning_rate": 4.172826515897146e-06, "loss": 2.559, "step": 46, "trainloss/critic_chosen": 1.1193464994430542, "trainloss/critic_rejected": 1.1624045372009277, "trainloss/reward": 1.1193464994430542, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.2109375, "trainrewards/margins": 1.96875, "trainrewards/rejected": -0.75 }, { "epoch": 0.35, "grad_norm": 4.76800798471375, "learning_rate": 4.123620120825459e-06, "loss": 2.5633, "step": 47, "trainloss/critic_chosen": 1.1039447784423828, "trainloss/critic_rejected": 1.1683855056762695, "trainloss/reward": 1.1039447784423828, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.5, "trainrewards/margins": 1.8515625, "trainrewards/rejected": -0.357421875 }, { "epoch": 0.36, "grad_norm": 4.677899041279874, "learning_rate": 4.073300977624594e-06, "loss": 2.6104, "step": 48, "trainloss/critic_chosen": 1.1293267011642456, "trainloss/critic_rejected": 1.173600435256958, "trainloss/reward": 1.1293267011642456, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.5625, "trainrewards/margins": 1.953125, "trainrewards/rejected": -0.38671875 }, { "epoch": 0.36, "grad_norm": 2.8973280324668815, "learning_rate": 4.021903572521802e-06, "loss": 2.5884, "step": 49, "trainloss/critic_chosen": 1.1289738416671753, "trainloss/critic_rejected": 1.169731855392456, "trainloss/reward": 1.1289738416671753, "trainrewards/accuracies": 0.9375000596046448, "trainrewards/chosen": 1.3125, "trainrewards/margins": 2.515625, "trainrewards/rejected": -1.203125 }, { "epoch": 0.37, "grad_norm": 2.9211772383175685, "learning_rate": 3.969463130731183e-06, "loss": 2.5868, "step": 50, "trainloss/critic_chosen": 1.1367411613464355, "trainloss/critic_rejected": 1.1725157499313354, "trainloss/reward": 1.1367411613464355, "trainrewards/accuracies": 0.9114583730697632, "trainrewards/chosen": 1.265625, "trainrewards/margins": 2.484375, "trainrewards/rejected": -1.21875 }, { "epoch": 0.38, "grad_norm": 3.4239805909008627, "learning_rate": 3.916015592312083e-06, "loss": 2.5442, "step": 51, "trainloss/critic_chosen": 1.101191759109497, "trainloss/critic_rejected": 1.2063257694244385, "trainloss/reward": 1.101191759109497, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.6171875, "trainrewards/margins": 2.546875, "trainrewards/rejected": -0.9296875 }, { "epoch": 0.39, "grad_norm": 3.3449791279382155, "learning_rate": 3.861597587537568e-06, "loss": 2.5532, "step": 52, "trainloss/critic_chosen": 1.1064534187316895, "trainloss/critic_rejected": 1.1979490518569946, "trainloss/reward": 1.1064534187316895, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.6015625, "trainrewards/margins": 2.46875, "trainrewards/rejected": -0.875 }, { "epoch": 0.39, "grad_norm": 3.9419484082989724, "learning_rate": 3.806246411789872e-06, "loss": 2.6147, "step": 53, "trainloss/critic_chosen": 1.138405680656433, "trainloss/critic_rejected": 1.1973538398742676, "trainloss/reward": 1.138405680656433, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.25, "trainrewards/margins": 2.609375, "trainrewards/rejected": -1.3671875 }, { "epoch": 0.4, "grad_norm": 3.4980254593155413, "learning_rate": 3.7500000000000005e-06, "loss": 2.5364, "step": 54, "trainloss/critic_chosen": 1.0845965147018433, "trainloss/critic_rejected": 1.1989306211471558, "trainloss/reward": 1.0845965147018433, "trainrewards/accuracies": 0.9635417461395264, "trainrewards/chosen": 1.6953125, "trainrewards/margins": 2.59375, "trainrewards/rejected": -0.89453125 }, { "epoch": 0.41, "grad_norm": 3.7684432316267347, "learning_rate": 3.6928969006490212e-06, "loss": 2.5578, "step": 55, "trainloss/critic_chosen": 1.105364441871643, "trainloss/critic_rejected": 1.1862692832946777, "trainloss/reward": 1.105364441871643, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 1.8046875, "trainrewards/margins": 2.65625, "trainrewards/rejected": -0.86328125 }, { "epoch": 0.42, "grad_norm": 2.733004985886796, "learning_rate": 3.634976249348867e-06, "loss": 2.5665, "step": 56, "trainloss/critic_chosen": 1.1256849765777588, "trainloss/critic_rejected": 1.1650742292404175, "trainloss/reward": 1.1256849765777588, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.328125, "trainrewards/margins": 2.390625, "trainrewards/rejected": -1.0546875 }, { "epoch": 0.42, "grad_norm": 3.063205301802556, "learning_rate": 3.5762777420207382e-06, "loss": 2.5733, "step": 57, "trainloss/critic_chosen": 1.1022924184799194, "trainloss/critic_rejected": 1.1559257507324219, "trainloss/reward": 1.1022924184799194, "trainrewards/accuracies": 0.9166666865348816, "trainrewards/chosen": 1.40625, "trainrewards/margins": 2.28125, "trainrewards/rejected": -0.875 }, { "epoch": 0.43, "grad_norm": 3.2936675397250985, "learning_rate": 3.516841607689501e-06, "loss": 2.529, "step": 58, "trainloss/critic_chosen": 1.0919924974441528, "trainloss/critic_rejected": 1.1807957887649536, "trainloss/reward": 1.0919924974441528, "trainrewards/accuracies": 0.9375000596046448, "trainrewards/chosen": 1.0703125, "trainrewards/margins": 2.0625, "trainrewards/rejected": -1.0 }, { "epoch": 0.44, "grad_norm": 2.9687788874925505, "learning_rate": 3.4567085809127247e-06, "loss": 2.5538, "step": 59, "trainloss/critic_chosen": 1.152530312538147, "trainloss/critic_rejected": 1.128198504447937, "trainloss/reward": 1.152530312538147, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.3125, "trainrewards/margins": 2.171875, "trainrewards/rejected": -0.86328125 }, { "epoch": 0.45, "grad_norm": 2.5189366946202374, "learning_rate": 3.39591987386325e-06, "loss": 2.4931, "step": 60, "trainloss/critic_chosen": 1.0971665382385254, "trainloss/critic_rejected": 1.189927339553833, "trainloss/reward": 1.0971665382385254, "trainrewards/accuracies": 0.96875, "trainrewards/chosen": 1.3828125, "trainrewards/margins": 2.671875, "trainrewards/rejected": -1.2890625 }, { "epoch": 0.45, "grad_norm": 4.707774798123127, "learning_rate": 3.3345171480844275e-06, "loss": 2.4995, "step": 61, "trainloss/critic_chosen": 1.1144541501998901, "trainloss/critic_rejected": 1.1472208499908447, "trainloss/reward": 1.1144541501998901, "trainrewards/accuracies": 0.9739583730697632, "trainrewards/chosen": 1.9921875, "trainrewards/margins": 2.765625, "trainrewards/rejected": -0.7734375 }, { "epoch": 0.46, "grad_norm": 3.621977923726089, "learning_rate": 3.272542485937369e-06, "loss": 2.5767, "step": 62, "trainloss/critic_chosen": 1.1388683319091797, "trainloss/critic_rejected": 1.1852062940597534, "trainloss/reward": 1.1388683319091797, "trainrewards/accuracies": 0.9479167461395264, "trainrewards/chosen": 1.8203125, "trainrewards/margins": 3.09375, "trainrewards/rejected": -1.265625 }, { "epoch": 0.47, "grad_norm": 4.340502288849219, "learning_rate": 3.2100383617598075e-06, "loss": 2.5008, "step": 63, "trainloss/critic_chosen": 1.0960522890090942, "trainloss/critic_rejected": 1.1389869451522827, "trainloss/reward": 1.0960522890090942, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.25, "trainrewards/margins": 2.8125, "trainrewards/rejected": -1.5703125 }, { "epoch": 0.48, "grad_norm": 3.2652013602478087, "learning_rate": 3.147047612756302e-06, "loss": 2.4784, "step": 64, "trainloss/critic_chosen": 1.1066646575927734, "trainloss/critic_rejected": 1.1423835754394531, "trainloss/reward": 1.1066646575927734, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.2265625, "trainrewards/margins": 2.859375, "trainrewards/rejected": -1.6328125 }, { "epoch": 0.48, "grad_norm": 4.460312181878758, "learning_rate": 3.0836134096397642e-06, "loss": 2.5315, "step": 65, "trainloss/critic_chosen": 1.097680926322937, "trainloss/critic_rejected": 1.1829330921173096, "trainloss/reward": 1.097680926322937, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.71875, "trainrewards/margins": 2.375, "trainrewards/rejected": -0.66015625 }, { "epoch": 0.49, "grad_norm": 5.398290397831798, "learning_rate": 3.019779227044398e-06, "loss": 2.4912, "step": 66, "trainloss/critic_chosen": 1.0728169679641724, "trainloss/critic_rejected": 1.1528609991073608, "trainloss/reward": 1.0728169679641724, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.75, "trainrewards/margins": 2.1875, "trainrewards/rejected": -0.44140625 }, { "epoch": 0.5, "grad_norm": 4.530365049006353, "learning_rate": 2.9555888137303695e-06, "loss": 2.4768, "step": 67, "trainloss/critic_chosen": 1.0978233814239502, "trainloss/critic_rejected": 1.1454182863235474, "trainloss/reward": 1.0978233814239502, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.515625, "trainrewards/margins": 2.1875, "trainrewards/rejected": -0.66015625 }, { "epoch": 0.51, "grad_norm": 3.090064735833262, "learning_rate": 2.8910861626005774e-06, "loss": 2.5542, "step": 68, "trainloss/critic_chosen": 1.1045993566513062, "trainloss/critic_rejected": 1.1823933124542236, "trainloss/reward": 1.1045993566513062, "trainrewards/accuracies": 0.9166666865348816, "trainrewards/chosen": 1.296875, "trainrewards/margins": 2.296875, "trainrewards/rejected": -1.0 }, { "epoch": 0.51, "grad_norm": 2.801294778929006, "learning_rate": 2.82631548055013e-06, "loss": 2.4752, "step": 69, "trainloss/critic_chosen": 1.0862737894058228, "trainloss/critic_rejected": 1.1638906002044678, "trainloss/reward": 1.0862737894058228, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.46875, "trainrewards/margins": 2.8125, "trainrewards/rejected": -1.359375 }, { "epoch": 0.52, "grad_norm": 3.5888770327583503, "learning_rate": 2.761321158169134e-06, "loss": 2.5502, "step": 70, "trainloss/critic_chosen": 1.1130059957504272, "trainloss/critic_rejected": 1.1747164726257324, "trainloss/reward": 1.1130059957504272, "trainrewards/accuracies": 0.9583333134651184, "trainrewards/chosen": 1.75, "trainrewards/margins": 2.953125, "trainrewards/rejected": -1.203125 }, { "epoch": 0.53, "grad_norm": 3.553005435624982, "learning_rate": 2.696147739319613e-06, "loss": 2.4735, "step": 71, "trainloss/critic_chosen": 1.1133400201797485, "trainloss/critic_rejected": 1.1409944295883179, "trainloss/reward": 1.1133400201797485, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.96875, "trainrewards/margins": 3.375, "trainrewards/rejected": -1.40625 }, { "epoch": 0.54, "grad_norm": 2.7088469336528145, "learning_rate": 2.6308398906073603e-06, "loss": 2.4512, "step": 72, "trainloss/critic_chosen": 1.1119564771652222, "trainloss/critic_rejected": 1.1244186162948608, "trainloss/reward": 1.1119564771652222, "trainrewards/accuracies": 0.96875, "trainrewards/chosen": 1.5703125, "trainrewards/margins": 3.03125, "trainrewards/rejected": -1.4609375 }, { "epoch": 0.54, "grad_norm": 3.938561115333166, "learning_rate": 2.5654423707696834e-06, "loss": 2.4921, "step": 73, "trainloss/critic_chosen": 1.0844348669052124, "trainloss/critic_rejected": 1.163710355758667, "trainloss/reward": 1.0844348669052124, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.0703125, "trainrewards/margins": 2.734375, "trainrewards/rejected": -1.6640625 }, { "epoch": 0.55, "grad_norm": 3.7076560975513293, "learning_rate": 2.5e-06, "loss": 2.4702, "step": 74, "trainloss/critic_chosen": 1.105428695678711, "trainloss/critic_rejected": 1.1134750843048096, "trainloss/reward": 1.105428695678711, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.1015625, "trainrewards/margins": 2.4375, "trainrewards/rejected": -1.328125 }, { "epoch": 0.56, "grad_norm": 4.584325275815331, "learning_rate": 2.434557629230318e-06, "loss": 2.5531, "step": 75, "trainloss/critic_chosen": 1.1023496389389038, "trainloss/critic_rejected": 1.1693300008773804, "trainloss/reward": 1.1023496389389038, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.6953125, "trainrewards/margins": 2.265625, "trainrewards/rejected": -0.5703125 }, { "epoch": 0.57, "grad_norm": 5.707921133643401, "learning_rate": 2.3691601093926406e-06, "loss": 2.512, "step": 76, "trainloss/critic_chosen": 1.0742218494415283, "trainloss/critic_rejected": 1.1473562717437744, "trainloss/reward": 1.0742218494415283, "trainrewards/accuracies": 0.9375000596046448, "trainrewards/chosen": 1.984375, "trainrewards/margins": 2.359375, "trainrewards/rejected": -0.380859375 }, { "epoch": 0.57, "grad_norm": 5.052893345106084, "learning_rate": 2.3038522606803882e-06, "loss": 2.5495, "step": 77, "trainloss/critic_chosen": 1.09754478931427, "trainloss/critic_rejected": 1.175227165222168, "trainloss/reward": 1.09754478931427, "trainrewards/accuracies": 0.9218751192092896, "trainrewards/chosen": 1.8671875, "trainrewards/margins": 2.359375, "trainrewards/rejected": -0.490234375 }, { "epoch": 0.58, "grad_norm": 3.505818483136781, "learning_rate": 2.238678841830867e-06, "loss": 2.5073, "step": 78, "trainloss/critic_chosen": 1.100816249847412, "trainloss/critic_rejected": 1.1553771495819092, "trainloss/reward": 1.100816249847412, "trainrewards/accuracies": 0.9375000596046448, "trainrewards/chosen": 1.4375, "trainrewards/margins": 2.1875, "trainrewards/rejected": -0.75 }, { "epoch": 0.59, "grad_norm": 4.2251215117971475, "learning_rate": 2.173684519449872e-06, "loss": 2.5035, "step": 79, "trainloss/critic_chosen": 1.093074083328247, "trainloss/critic_rejected": 1.163825511932373, "trainloss/reward": 1.093074083328247, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 0.91796875, "trainrewards/margins": 2.140625, "trainrewards/rejected": -1.21875 }, { "epoch": 0.6, "grad_norm": 4.171916933286059, "learning_rate": 2.1089138373994226e-06, "loss": 2.4726, "step": 80, "trainloss/critic_chosen": 1.0706841945648193, "trainloss/critic_rejected": 1.160952091217041, "trainloss/reward": 1.0706841945648193, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 0.98046875, "trainrewards/margins": 2.375, "trainrewards/rejected": -1.390625 }, { "epoch": 0.6, "grad_norm": 2.7690433360924085, "learning_rate": 2.0444111862696313e-06, "loss": 2.4269, "step": 81, "trainloss/critic_chosen": 1.0752573013305664, "trainloss/critic_rejected": 1.1339901685714722, "trainloss/reward": 1.0752573013305664, "trainrewards/accuracies": 0.9739583730697632, "trainrewards/chosen": 1.484375, "trainrewards/margins": 2.578125, "trainrewards/rejected": -1.09375 }, { "epoch": 0.61, "grad_norm": 3.358268001716196, "learning_rate": 1.9802207729556023e-06, "loss": 2.461, "step": 82, "trainloss/critic_chosen": 1.1075457334518433, "trainloss/critic_rejected": 1.1157523393630981, "trainloss/reward": 1.1075457334518433, "trainrewards/accuracies": 0.953125, "trainrewards/chosen": 1.8828125, "trainrewards/margins": 2.90625, "trainrewards/rejected": -1.0234375 }, { "epoch": 0.62, "grad_norm": 4.328068525423629, "learning_rate": 1.9163865903602374e-06, "loss": 2.5352, "step": 83, "trainloss/critic_chosen": 1.1028249263763428, "trainloss/critic_rejected": 1.1644842624664307, "trainloss/reward": 1.1028249263763428, "trainrewards/accuracies": 0.96875, "trainrewards/chosen": 1.8671875, "trainrewards/margins": 2.921875, "trainrewards/rejected": -1.0625 }, { "epoch": 0.63, "grad_norm": 3.266438978334478, "learning_rate": 1.852952387243698e-06, "loss": 2.4134, "step": 84, "trainloss/critic_chosen": 1.0756388902664185, "trainloss/critic_rejected": 1.1303694248199463, "trainloss/reward": 1.0756388902664185, "trainrewards/accuracies": 0.9687500596046448, "trainrewards/chosen": 1.9140625, "trainrewards/margins": 3.25, "trainrewards/rejected": -1.328125 }, { "epoch": 0.63, "grad_norm": 2.386641393706194, "learning_rate": 1.7899616382401935e-06, "loss": 2.401, "step": 85, "trainloss/critic_chosen": 1.0511287450790405, "trainloss/critic_rejected": 1.128703236579895, "trainloss/reward": 1.0511287450790405, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.6015625, "trainrewards/margins": 2.953125, "trainrewards/rejected": -1.359375 }, { "epoch": 0.64, "grad_norm": 3.7161933000807403, "learning_rate": 1.7274575140626318e-06, "loss": 2.4732, "step": 86, "trainloss/critic_chosen": 1.0827696323394775, "trainloss/critic_rejected": 1.1439146995544434, "trainloss/reward": 1.0827696323394775, "trainrewards/accuracies": 0.9583333134651184, "trainrewards/chosen": 1.0, "trainrewards/margins": 2.734375, "trainrewards/rejected": -1.734375 }, { "epoch": 0.65, "grad_norm": 3.4186216283012754, "learning_rate": 1.665482851915573e-06, "loss": 2.5064, "step": 87, "trainloss/critic_chosen": 1.093652367591858, "trainloss/critic_rejected": 1.1373913288116455, "trainloss/reward": 1.093652367591858, "trainrewards/accuracies": 0.927083432674408, "trainrewards/chosen": 1.09375, "trainrewards/margins": 2.5625, "trainrewards/rejected": -1.46875 }, { "epoch": 0.66, "grad_norm": 2.4263959266567996, "learning_rate": 1.6040801261367494e-06, "loss": 2.5409, "step": 88, "trainloss/critic_chosen": 1.1319228410720825, "trainloss/critic_rejected": 1.1887366771697998, "trainloss/reward": 1.1319228410720825, "trainrewards/accuracies": 0.9687501192092896, "trainrewards/chosen": 1.3125, "trainrewards/margins": 2.6875, "trainrewards/rejected": -1.375 }, { "epoch": 0.66, "grad_norm": 4.091003192293857, "learning_rate": 1.5432914190872757e-06, "loss": 2.5386, "step": 89, "trainloss/critic_chosen": 1.1037051677703857, "trainloss/critic_rejected": 1.1342533826828003, "trainloss/reward": 1.1037051677703857, "trainrewards/accuracies": 0.9427083134651184, "trainrewards/chosen": 1.640625, "trainrewards/margins": 2.375, "trainrewards/rejected": -0.734375 }, { "epoch": 0.67, "grad_norm": 4.356596196020246, "learning_rate": 1.4831583923105e-06, "loss": 2.4845, "step": 90, "trainloss/critic_chosen": 1.0889127254486084, "trainloss/critic_rejected": 1.1599314212799072, "trainloss/reward": 1.0889127254486084, "trainrewards/accuracies": 0.9583333134651184, "trainrewards/chosen": 1.875, "trainrewards/margins": 2.59375, "trainrewards/rejected": -0.7265625 }, { "epoch": 0.68, "grad_norm": 3.484859150407605, "learning_rate": 1.4237222579792618e-06, "loss": 2.504, "step": 91, "trainloss/critic_chosen": 1.1031081676483154, "trainloss/critic_rejected": 1.1596983671188354, "trainloss/reward": 1.1031081676483154, "trainrewards/accuracies": 0.953125, "trainrewards/chosen": 1.7265625, "trainrewards/margins": 2.5, "trainrewards/rejected": -0.765625 }, { "epoch": 0.69, "grad_norm": 3.5906077474254046, "learning_rate": 1.3650237506511333e-06, "loss": 2.497, "step": 92, "trainloss/critic_chosen": 1.1017568111419678, "trainloss/critic_rejected": 1.1597734689712524, "trainloss/reward": 1.1017568111419678, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.734375, "trainrewards/margins": 2.609375, "trainrewards/rejected": -0.87109375 }, { "epoch": 0.69, "grad_norm": 3.883326754801315, "learning_rate": 1.307103099350979e-06, "loss": 2.4881, "step": 93, "trainloss/critic_chosen": 1.1008602380752563, "trainloss/critic_rejected": 1.1622505187988281, "trainloss/reward": 1.1008602380752563, "trainrewards/accuracies": 0.9374999403953552, "trainrewards/chosen": 1.8359375, "trainrewards/margins": 2.65625, "trainrewards/rejected": -0.81640625 }, { "epoch": 0.7, "grad_norm": 3.106807473961497, "learning_rate": 1.2500000000000007e-06, "loss": 2.5239, "step": 94, "trainloss/critic_chosen": 1.1186132431030273, "trainloss/critic_rejected": 1.1955691576004028, "trainloss/reward": 1.1186132431030273, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.4453125, "trainrewards/margins": 2.78125, "trainrewards/rejected": -1.34375 }, { "epoch": 0.71, "grad_norm": 3.0694983477589237, "learning_rate": 1.193753588210128e-06, "loss": 2.4975, "step": 95, "trainloss/critic_chosen": 1.089274287223816, "trainloss/critic_rejected": 1.1611120700836182, "trainloss/reward": 1.089274287223816, "trainrewards/accuracies": 0.9166667461395264, "trainrewards/chosen": 1.21875, "trainrewards/margins": 2.625, "trainrewards/rejected": -1.4140625 }, { "epoch": 0.72, "grad_norm": 2.647849041797858, "learning_rate": 1.1384024124624324e-06, "loss": 2.4533, "step": 96, "trainloss/critic_chosen": 1.0731533765792847, "trainloss/critic_rejected": 1.1588420867919922, "trainloss/reward": 1.0731533765792847, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.2578125, "trainrewards/margins": 2.671875, "trainrewards/rejected": -1.40625 }, { "epoch": 0.72, "grad_norm": 3.0284092019206743, "learning_rate": 1.0839844076879186e-06, "loss": 2.52, "step": 97, "trainloss/critic_chosen": 1.1046061515808105, "trainloss/critic_rejected": 1.1355366706848145, "trainloss/reward": 1.1046061515808105, "trainrewards/accuracies": 0.9114583134651184, "trainrewards/chosen": 1.5234375, "trainrewards/margins": 2.515625, "trainrewards/rejected": -1.0 }, { "epoch": 0.73, "grad_norm": 3.239930376341791, "learning_rate": 1.0305368692688175e-06, "loss": 2.3829, "step": 98, "trainloss/critic_chosen": 1.0649518966674805, "trainloss/critic_rejected": 1.1148179769515991, "trainloss/reward": 1.0649518966674805, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.828125, "trainrewards/margins": 2.921875, "trainrewards/rejected": -1.0859375 }, { "epoch": 0.74, "grad_norm": 2.844057334430093, "learning_rate": 9.780964274781984e-07, "loss": 2.4761, "step": 99, "trainloss/critic_chosen": 1.0876730680465698, "trainloss/critic_rejected": 1.1597809791564941, "trainloss/reward": 1.0876730680465698, "trainrewards/accuracies": 0.9583333134651184, "trainrewards/chosen": 1.65625, "trainrewards/margins": 2.703125, "trainrewards/rejected": -1.046875 }, { "epoch": 0.74, "grad_norm": 2.523374395571222, "learning_rate": 9.266990223754069e-07, "loss": 2.4511, "step": 100, "trainloss/critic_chosen": 1.0983867645263672, "trainloss/critic_rejected": 1.1455085277557373, "trainloss/reward": 1.0983867645263672, "trainrewards/accuracies": 0.9791666865348816, "trainrewards/chosen": 1.5546875, "trainrewards/margins": 2.78125, "trainrewards/rejected": -1.21875 }, { "epoch": 0.75, "grad_norm": 3.3175745581436917, "learning_rate": 8.763798791745413e-07, "loss": 2.453, "step": 101, "trainloss/critic_chosen": 1.094862699508667, "trainloss/critic_rejected": 1.1401726007461548, "trainloss/reward": 1.094862699508667, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.625, "trainrewards/margins": 2.78125, "trainrewards/rejected": -1.15625 }, { "epoch": 0.76, "grad_norm": 2.8366252126004596, "learning_rate": 8.271734841028553e-07, "loss": 2.5483, "step": 102, "trainloss/critic_chosen": 1.0930631160736084, "trainloss/critic_rejected": 1.173164963722229, "trainloss/reward": 1.0930631160736084, "trainrewards/accuracies": 0.8958333134651184, "trainrewards/chosen": 1.390625, "trainrewards/margins": 2.515625, "trainrewards/rejected": -1.125 }, { "epoch": 0.77, "grad_norm": 2.9707853433568023, "learning_rate": 7.791135607656147e-07, "loss": 2.3986, "step": 103, "trainloss/critic_chosen": 1.0701719522476196, "trainloss/critic_rejected": 1.1288470029830933, "trainloss/reward": 1.0701719522476196, "trainrewards/accuracies": 0.9791667461395264, "trainrewards/chosen": 1.6328125, "trainrewards/margins": 2.765625, "trainrewards/rejected": -1.1328125 }, { "epoch": 0.77, "grad_norm": 3.309757754883857, "learning_rate": 7.322330470336314e-07, "loss": 2.429, "step": 104, "trainloss/critic_chosen": 1.0845046043395996, "trainloss/critic_rejected": 1.1261361837387085, "trainloss/reward": 1.0845046043395996, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.6796875, "trainrewards/margins": 2.71875, "trainrewards/rejected": -1.0234375 }, { "epoch": 0.78, "grad_norm": 2.4159819079210556, "learning_rate": 6.865640724692815e-07, "loss": 2.3868, "step": 105, "trainloss/critic_chosen": 1.0498684644699097, "trainloss/critic_rejected": 1.131639003753662, "trainloss/reward": 1.0498684644699097, "trainrewards/accuracies": 0.9687500596046448, "trainrewards/chosen": 1.5, "trainrewards/margins": 2.90625, "trainrewards/rejected": -1.3984375 }, { "epoch": 0.79, "grad_norm": 2.630877225161229, "learning_rate": 6.421379363065142e-07, "loss": 2.4745, "step": 106, "trainloss/critic_chosen": 1.0781538486480713, "trainloss/critic_rejected": 1.1649324893951416, "trainloss/reward": 1.0781538486480713, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.5, "trainrewards/margins": 2.78125, "trainrewards/rejected": -1.28125 }, { "epoch": 0.8, "grad_norm": 2.71300394057213, "learning_rate": 5.989850859999227e-07, "loss": 2.4433, "step": 107, "trainloss/critic_chosen": 1.0875132083892822, "trainloss/critic_rejected": 1.1300991773605347, "trainloss/reward": 1.0875132083892822, "trainrewards/accuracies": 0.9635416865348816, "trainrewards/chosen": 1.4140625, "trainrewards/margins": 3.109375, "trainrewards/rejected": -1.703125 }, { "epoch": 0.8, "grad_norm": 2.722489376200587, "learning_rate": 5.571350963575728e-07, "loss": 2.467, "step": 108, "trainloss/critic_chosen": 1.0709630250930786, "trainloss/critic_rejected": 1.154178500175476, "trainloss/reward": 1.0709630250930786, "trainrewards/accuracies": 0.9479166865348816, "trainrewards/chosen": 1.359375, "trainrewards/margins": 2.859375, "trainrewards/rejected": -1.5 }, { "epoch": 0.81, "grad_norm": 3.255161744830997, "learning_rate": 5.166166492719124e-07, "loss": 2.4854, "step": 109, "trainloss/critic_chosen": 1.1081310510635376, "trainloss/critic_rejected": 1.152534008026123, "trainloss/reward": 1.1081310510635376, "trainrewards/accuracies": 0.973958432674408, "trainrewards/chosen": 1.34375, "trainrewards/margins": 2.96875, "trainrewards/rejected": -1.6328125 }, { "epoch": 0.82, "grad_norm": 2.762498507683836, "learning_rate": 4.774575140626317e-07, "loss": 2.4388, "step": 110, "trainloss/critic_chosen": 1.065203070640564, "trainloss/critic_rejected": 1.0969582796096802, "trainloss/reward": 1.065203070640564, "trainrewards/accuracies": 0.9635416269302368, "trainrewards/chosen": 1.3046875, "trainrewards/margins": 2.609375, "trainrewards/rejected": -1.3125 }, { "epoch": 0.83, "grad_norm": 2.780757216314426, "learning_rate": 4.396845284449608e-07, "loss": 2.4319, "step": 111, "trainloss/critic_chosen": 1.083713173866272, "trainloss/critic_rejected": 1.119750738143921, "trainloss/reward": 1.083713173866272, "trainrewards/accuracies": 0.9687500596046448, "trainrewards/chosen": 1.7421875, "trainrewards/margins": 3.03125, "trainrewards/rejected": -1.296875 }, { "epoch": 0.83, "grad_norm": 3.7107544004289323, "learning_rate": 4.033235801364402e-07, "loss": 2.4846, "step": 112, "trainloss/critic_chosen": 1.106475830078125, "trainloss/critic_rejected": 1.1211233139038086, "trainloss/reward": 1.106475830078125, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.7421875, "trainrewards/margins": 2.703125, "trainrewards/rejected": -0.96484375 }, { "epoch": 0.84, "grad_norm": 3.3073751739512787, "learning_rate": 3.683995891147696e-07, "loss": 2.4629, "step": 113, "trainloss/critic_chosen": 1.0521959066390991, "trainloss/critic_rejected": 1.173767328262329, "trainloss/reward": 1.0521959066390991, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.828125, "trainrewards/margins": 2.921875, "trainrewards/rejected": -1.0859375 }, { "epoch": 0.85, "grad_norm": 2.99774406823753, "learning_rate": 3.3493649053890325e-07, "loss": 2.536, "step": 114, "trainloss/critic_chosen": 1.1110682487487793, "trainloss/critic_rejected": 1.155356526374817, "trainloss/reward": 1.1110682487487793, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 1.5859375, "trainrewards/margins": 2.75, "trainrewards/rejected": -1.1640625 }, { "epoch": 0.86, "grad_norm": 3.513330205624271, "learning_rate": 3.0295721834508686e-07, "loss": 2.4707, "step": 115, "trainloss/critic_chosen": 1.0783016681671143, "trainloss/critic_rejected": 1.1236062049865723, "trainloss/reward": 1.0783016681671143, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.703125, "trainrewards/margins": 2.671875, "trainrewards/rejected": -0.97265625 }, { "epoch": 0.86, "grad_norm": 2.800803642232422, "learning_rate": 2.7248368952908055e-07, "loss": 2.4803, "step": 116, "trainloss/critic_chosen": 1.080200433731079, "trainloss/critic_rejected": 1.1515780687332153, "trainloss/reward": 1.080200433731079, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.5546875, "trainrewards/margins": 2.5625, "trainrewards/rejected": -1.0 }, { "epoch": 0.87, "grad_norm": 2.7889069352140585, "learning_rate": 2.43536789125349e-07, "loss": 2.4905, "step": 117, "trainloss/critic_chosen": 1.088797688484192, "trainloss/critic_rejected": 1.1520254611968994, "trainloss/reward": 1.088797688484192, "trainrewards/accuracies": 0.9375, "trainrewards/chosen": 1.5, "trainrewards/margins": 2.515625, "trainrewards/rejected": -1.0078125 }, { "epoch": 0.88, "grad_norm": 2.931939214492335, "learning_rate": 2.1613635589349756e-07, "loss": 2.3937, "step": 118, "trainloss/critic_chosen": 1.0556377172470093, "trainloss/critic_rejected": 1.1278637647628784, "trainloss/reward": 1.0556377172470093, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.4375, "trainrewards/margins": 2.46875, "trainrewards/rejected": -1.03125 }, { "epoch": 0.89, "grad_norm": 2.954190958849344, "learning_rate": 1.9030116872178317e-07, "loss": 2.418, "step": 119, "trainloss/critic_chosen": 1.0978080034255981, "trainloss/critic_rejected": 1.1148779392242432, "trainloss/reward": 1.0978080034255981, "trainrewards/accuracies": 0.9687500596046448, "trainrewards/chosen": 1.4453125, "trainrewards/margins": 2.46875, "trainrewards/rejected": -1.03125 }, { "epoch": 0.89, "grad_norm": 2.9483773523832353, "learning_rate": 1.6604893375699594e-07, "loss": 2.4694, "step": 120, "trainloss/critic_chosen": 1.1018942594528198, "trainloss/critic_rejected": 1.1370322704315186, "trainloss/reward": 1.1018942594528198, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.3515625, "trainrewards/margins": 2.40625, "trainrewards/rejected": -1.0546875 }, { "epoch": 0.9, "grad_norm": 2.9215978058037764, "learning_rate": 1.4339627226955394e-07, "loss": 2.4822, "step": 121, "trainloss/critic_chosen": 1.1052017211914062, "trainloss/critic_rejected": 1.148177981376648, "trainloss/reward": 1.1052017211914062, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.3515625, "trainrewards/margins": 2.515625, "trainrewards/rejected": -1.1640625 }, { "epoch": 0.91, "grad_norm": 2.8843923021301667, "learning_rate": 1.223587092621162e-07, "loss": 2.4942, "step": 122, "trainloss/critic_chosen": 1.0736005306243896, "trainloss/critic_rejected": 1.168089509010315, "trainloss/reward": 1.0736005306243896, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.3359375, "trainrewards/margins": 2.328125, "trainrewards/rejected": -0.99609375 }, { "epoch": 0.92, "grad_norm": 2.8724683106941193, "learning_rate": 1.0295066282951738e-07, "loss": 2.4881, "step": 123, "trainloss/critic_chosen": 1.09504234790802, "trainloss/critic_rejected": 1.1352362632751465, "trainloss/reward": 1.09504234790802, "trainrewards/accuracies": 0.9322916865348816, "trainrewards/chosen": 1.4375, "trainrewards/margins": 2.3125, "trainrewards/rejected": -0.87109375 }, { "epoch": 0.92, "grad_norm": 3.0064917280475045, "learning_rate": 8.518543427732951e-08, "loss": 2.5066, "step": 124, "trainloss/critic_chosen": 1.0965893268585205, "trainloss/critic_rejected": 1.12990403175354, "trainloss/reward": 1.0965893268585205, "trainrewards/accuracies": 0.9166667461395264, "trainrewards/chosen": 1.4375, "trainrewards/margins": 2.3125, "trainrewards/rejected": -0.8828125 }, { "epoch": 0.93, "grad_norm": 2.6882210161223425, "learning_rate": 6.907519900580862e-08, "loss": 2.3973, "step": 125, "trainloss/critic_chosen": 1.0724809169769287, "trainloss/critic_rejected": 1.1239736080169678, "trainloss/reward": 1.0724809169769287, "trainrewards/accuracies": 0.9687500596046448, "trainrewards/chosen": 1.546875, "trainrewards/margins": 2.5625, "trainrewards/rejected": -1.015625 }, { "epoch": 0.94, "grad_norm": 3.2130299463812233, "learning_rate": 5.463099816548578e-08, "loss": 2.4583, "step": 126, "trainloss/critic_chosen": 1.053167700767517, "trainloss/critic_rejected": 1.1157554388046265, "trainloss/reward": 1.053167700767517, "trainrewards/accuracies": 0.9270833730697632, "trainrewards/chosen": 1.390625, "trainrewards/margins": 2.171875, "trainrewards/rejected": -0.7890625 }, { "epoch": 0.95, "grad_norm": 2.5537231163007004, "learning_rate": 4.186273109011374e-08, "loss": 2.5432, "step": 127, "trainloss/critic_chosen": 1.1048160791397095, "trainloss/critic_rejected": 1.1709802150726318, "trainloss/reward": 1.1048160791397095, "trainrewards/accuracies": 0.9270833134651184, "trainrewards/chosen": 1.234375, "trainrewards/margins": 2.296875, "trainrewards/rejected": -1.0546875 }, { "epoch": 0.95, "grad_norm": 3.455571318987563, "learning_rate": 3.077914851215585e-08, "loss": 2.4356, "step": 128, "trainloss/critic_chosen": 1.0750889778137207, "trainloss/critic_rejected": 1.1610357761383057, "trainloss/reward": 1.0750889778137207, "trainrewards/accuracies": 0.9635416865348816, "trainrewards/chosen": 1.734375, "trainrewards/margins": 2.625, "trainrewards/rejected": -0.89453125 }, { "epoch": 0.96, "grad_norm": 3.095008653826813, "learning_rate": 2.1387846565474047e-08, "loss": 2.4339, "step": 129, "trainloss/critic_chosen": 1.0719571113586426, "trainloss/critic_rejected": 1.1504600048065186, "trainloss/reward": 1.0719571113586426, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.6328125, "trainrewards/margins": 2.59375, "trainrewards/rejected": -0.96484375 }, { "epoch": 0.97, "grad_norm": 3.014103190303491, "learning_rate": 1.3695261579316776e-08, "loss": 2.4359, "step": 130, "trainloss/critic_chosen": 1.0608913898468018, "trainloss/critic_rejected": 1.1623928546905518, "trainloss/reward": 1.0608913898468018, "trainrewards/accuracies": 0.9791667461395264, "trainrewards/chosen": 1.5078125, "trainrewards/margins": 2.453125, "trainrewards/rejected": -0.94921875 }, { "epoch": 0.98, "grad_norm": 3.5824668969231825, "learning_rate": 7.70666566718009e-09, "loss": 2.457, "step": 131, "trainloss/critic_chosen": 1.0644171237945557, "trainloss/critic_rejected": 1.1539928913116455, "trainloss/reward": 1.0644171237945557, "trainrewards/accuracies": 0.9583333730697632, "trainrewards/chosen": 1.578125, "trainrewards/margins": 2.5, "trainrewards/rejected": -0.91796875 }, { "epoch": 0.98, "grad_norm": 3.085175163660414, "learning_rate": 3.4261631135654174e-09, "loss": 2.4695, "step": 132, "trainloss/critic_chosen": 1.0733301639556885, "trainloss/critic_rejected": 1.1289600133895874, "trainloss/reward": 1.0733301639556885, "trainrewards/accuracies": 0.9427083730697632, "trainrewards/chosen": 1.484375, "trainrewards/margins": 2.296875, "trainrewards/rejected": -0.80859375 }, { "epoch": 0.99, "grad_norm": 2.6627177185366087, "learning_rate": 8.566875611068503e-10, "loss": 2.456, "step": 133, "trainloss/critic_chosen": 1.0969208478927612, "trainloss/critic_rejected": 1.1649024486541748, "trainloss/reward": 1.0969208478927612, "trainrewards/accuracies": 0.96875, "trainrewards/chosen": 1.4765625, "trainrewards/margins": 2.59375, "trainrewards/rejected": -1.1171875 }, { "epoch": 1.0, "grad_norm": 2.6690531080971973, "learning_rate": 0.0, "loss": 2.4519, "step": 134, "trainloss/critic_chosen": 1.090218186378479, "trainloss/critic_rejected": 1.128463625907898, "trainloss/reward": 1.090218186378479, "trainrewards/accuracies": 0.9531250596046448, "trainrewards/chosen": 1.5078125, "trainrewards/margins": 2.53125, "trainrewards/rejected": -1.0234375 }, { "epoch": 1.0, "step": 134, "total_flos": 0.0, "train_loss": 2.6233635464710976, "train_runtime": 32287.388, "train_samples_per_second": 0.799, "train_steps_per_second": 0.004 } ], "logging_steps": 1.0, "max_steps": 134, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }