MM-RLHF-Reward-7B-llava-ov-qwen / trainer_state.json
yifanzhang114's picture
Upload trainer_state.json with huggingface_hub
d0b0080 verified
raw
history blame
65.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9981378026070763,
"eval_steps": 500,
"global_step": 134,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 20.656896651231627,
"learning_rate": 3.5714285714285716e-07,
"loss": 3.6136,
"step": 1,
"trainloss/critic_chosen": 1.459133505821228,
"trainloss/critic_rejected": 1.468864917755127,
"trainloss/reward": 1.459133505821228,
"trainrewards/accuracies": 0.5833333134651184,
"trainrewards/chosen": 0.3359375,
"trainrewards/margins": 0.0308837890625,
"trainrewards/rejected": 0.3046875
},
{
"epoch": 0.01,
"grad_norm": 20.747827742934472,
"learning_rate": 7.142857142857143e-07,
"loss": 3.6381,
"step": 2,
"trainloss/critic_chosen": 1.4447739124298096,
"trainloss/critic_rejected": 1.4999535083770752,
"trainloss/reward": 1.4447739124298096,
"trainrewards/accuracies": 0.5104166865348816,
"trainrewards/chosen": 0.314453125,
"trainrewards/margins": 0.01531982421875,
"trainrewards/rejected": 0.298828125
},
{
"epoch": 0.02,
"grad_norm": 19.429850511676147,
"learning_rate": 1.0714285714285714e-06,
"loss": 3.6713,
"step": 3,
"trainloss/critic_chosen": 1.4738179445266724,
"trainloss/critic_rejected": 1.505049228668213,
"trainloss/reward": 1.4738179445266724,
"trainrewards/accuracies": 0.5364583134651184,
"trainrewards/chosen": 0.302734375,
"trainrewards/margins": 0.015869140625,
"trainrewards/rejected": 0.287109375
},
{
"epoch": 0.03,
"grad_norm": 19.353394477551088,
"learning_rate": 1.4285714285714286e-06,
"loss": 3.6414,
"step": 4,
"trainloss/critic_chosen": 1.4593632221221924,
"trainloss/critic_rejected": 1.4944238662719727,
"trainloss/reward": 1.4593632221221924,
"trainrewards/accuracies": 0.5572916865348816,
"trainrewards/chosen": 0.357421875,
"trainrewards/margins": 0.0286865234375,
"trainrewards/rejected": 0.328125
},
{
"epoch": 0.04,
"grad_norm": 19.314515274756275,
"learning_rate": 1.7857142857142859e-06,
"loss": 3.6014,
"step": 5,
"trainloss/critic_chosen": 1.4528993368148804,
"trainloss/critic_rejected": 1.5066261291503906,
"trainloss/reward": 1.4528993368148804,
"trainrewards/accuracies": 0.7760417461395264,
"trainrewards/chosen": 0.494140625,
"trainrewards/margins": 0.130859375,
"trainrewards/rejected": 0.36328125
},
{
"epoch": 0.04,
"grad_norm": 17.65488892011503,
"learning_rate": 2.1428571428571427e-06,
"loss": 3.5132,
"step": 6,
"trainloss/critic_chosen": 1.4163868427276611,
"trainloss/critic_rejected": 1.4701489210128784,
"trainloss/reward": 1.4163868427276611,
"trainrewards/accuracies": 0.7864583730697632,
"trainrewards/chosen": 0.54296875,
"trainrewards/margins": 0.169921875,
"trainrewards/rejected": 0.373046875
},
{
"epoch": 0.05,
"grad_norm": 14.922878013160528,
"learning_rate": 2.5e-06,
"loss": 3.4609,
"step": 7,
"trainloss/critic_chosen": 1.4025382995605469,
"trainloss/critic_rejected": 1.4922353029251099,
"trainloss/reward": 1.4025382995605469,
"trainrewards/accuracies": 0.8281250596046448,
"trainrewards/chosen": 0.9453125,
"trainrewards/margins": 0.376953125,
"trainrewards/rejected": 0.56640625
},
{
"epoch": 0.06,
"grad_norm": 14.232594419094823,
"learning_rate": 2.8571428571428573e-06,
"loss": 3.3368,
"step": 8,
"trainloss/critic_chosen": 1.3685592412948608,
"trainloss/critic_rejected": 1.4283447265625,
"trainloss/reward": 1.3685592412948608,
"trainrewards/accuracies": 0.8645833730697632,
"trainrewards/chosen": 1.0078125,
"trainrewards/margins": 0.47265625,
"trainrewards/rejected": 0.53515625
},
{
"epoch": 0.07,
"grad_norm": 12.480541507697524,
"learning_rate": 3.2142857142857147e-06,
"loss": 3.0686,
"step": 9,
"trainloss/critic_chosen": 1.3232171535491943,
"trainloss/critic_rejected": 1.3606585264205933,
"trainloss/reward": 1.3232171535491943,
"trainrewards/accuracies": 0.9114583134651184,
"trainrewards/chosen": 1.671875,
"trainrewards/margins": 1.3359375,
"trainrewards/rejected": 0.333984375
},
{
"epoch": 0.07,
"grad_norm": 8.25153148096521,
"learning_rate": 3.5714285714285718e-06,
"loss": 3.0194,
"step": 10,
"trainloss/critic_chosen": 1.289527177810669,
"trainloss/critic_rejected": 1.3465213775634766,
"trainloss/reward": 1.289527177810669,
"trainrewards/accuracies": 0.9010416865348816,
"trainrewards/chosen": 1.3125,
"trainrewards/margins": 1.4375,
"trainrewards/rejected": -0.1220703125
},
{
"epoch": 0.08,
"grad_norm": 15.573626174864643,
"learning_rate": 3.928571428571429e-06,
"loss": 2.9476,
"step": 11,
"trainloss/critic_chosen": 1.266632318496704,
"trainloss/critic_rejected": 1.3359544277191162,
"trainloss/reward": 1.266632318496704,
"trainrewards/accuracies": 0.9010417461395264,
"trainrewards/chosen": 0.208984375,
"trainrewards/margins": 2.25,
"trainrewards/rejected": -2.046875
},
{
"epoch": 0.09,
"grad_norm": 8.798738351150583,
"learning_rate": 4.2857142857142855e-06,
"loss": 2.9208,
"step": 12,
"trainloss/critic_chosen": 1.285549521446228,
"trainloss/critic_rejected": 1.336460828781128,
"trainloss/reward": 1.285549521446228,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 1.390625,
"trainrewards/margins": 2.453125,
"trainrewards/rejected": -1.0625
},
{
"epoch": 0.1,
"grad_norm": 10.920314695532706,
"learning_rate": 4.642857142857144e-06,
"loss": 2.9737,
"step": 13,
"trainloss/critic_chosen": 1.3031879663467407,
"trainloss/critic_rejected": 1.3538345098495483,
"trainloss/reward": 1.3031879663467407,
"trainrewards/accuracies": 0.9583333134651184,
"trainrewards/chosen": 1.9453125,
"trainrewards/margins": 2.203125,
"trainrewards/rejected": -0.255859375
},
{
"epoch": 0.1,
"grad_norm": 7.902411216600844,
"learning_rate": 5e-06,
"loss": 2.8568,
"step": 14,
"trainloss/critic_chosen": 1.2445811033248901,
"trainloss/critic_rejected": 1.3195788860321045,
"trainloss/reward": 1.2445811033248901,
"trainrewards/accuracies": 0.90625,
"trainrewards/chosen": 1.5078125,
"trainrewards/margins": 2.09375,
"trainrewards/rejected": -0.5859375
},
{
"epoch": 0.11,
"grad_norm": 11.497405609399582,
"learning_rate": 4.999143312438893e-06,
"loss": 2.8485,
"step": 15,
"trainloss/critic_chosen": 1.2575112581253052,
"trainloss/critic_rejected": 1.303661823272705,
"trainloss/reward": 1.2575112581253052,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 0.55078125,
"trainrewards/margins": 1.84375,
"trainrewards/rejected": -1.2890625
},
{
"epoch": 0.12,
"grad_norm": 8.767557307270321,
"learning_rate": 4.9965738368864345e-06,
"loss": 2.8737,
"step": 16,
"trainloss/critic_chosen": 1.2387288808822632,
"trainloss/critic_rejected": 1.3038721084594727,
"trainloss/reward": 1.2387288808822632,
"trainrewards/accuracies": 0.9010416865348816,
"trainrewards/chosen": 1.578125,
"trainrewards/margins": 2.59375,
"trainrewards/rejected": -1.015625
},
{
"epoch": 0.13,
"grad_norm": 8.749611939075479,
"learning_rate": 4.992293334332821e-06,
"loss": 2.8681,
"step": 17,
"trainloss/critic_chosen": 1.2373957633972168,
"trainloss/critic_rejected": 1.302640676498413,
"trainloss/reward": 1.2373957633972168,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.5859375,
"trainrewards/margins": 2.203125,
"trainrewards/rejected": -0.61328125
},
{
"epoch": 0.13,
"grad_norm": 8.484573172931489,
"learning_rate": 4.986304738420684e-06,
"loss": 2.8305,
"step": 18,
"trainloss/critic_chosen": 1.23964262008667,
"trainloss/critic_rejected": 1.297165870666504,
"trainloss/reward": 1.23964262008667,
"trainrewards/accuracies": 0.9166666865348816,
"trainrewards/chosen": 0.60546875,
"trainrewards/margins": 1.765625,
"trainrewards/rejected": -1.15625
},
{
"epoch": 0.14,
"grad_norm": 5.43676917020596,
"learning_rate": 4.978612153434527e-06,
"loss": 2.7193,
"step": 19,
"trainloss/critic_chosen": 1.2180960178375244,
"trainloss/critic_rejected": 1.2327347993850708,
"trainloss/reward": 1.2180960178375244,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.5546875,
"trainrewards/margins": 2.296875,
"trainrewards/rejected": -0.7421875
},
{
"epoch": 0.15,
"grad_norm": 5.651490595569425,
"learning_rate": 4.9692208514878445e-06,
"loss": 2.8528,
"step": 20,
"trainloss/critic_chosen": 1.2103852033615112,
"trainloss/critic_rejected": 1.3035379648208618,
"trainloss/reward": 1.2103852033615112,
"trainrewards/accuracies": 0.9062500596046448,
"trainrewards/chosen": 1.75,
"trainrewards/margins": 2.6875,
"trainrewards/rejected": -0.94921875
},
{
"epoch": 0.16,
"grad_norm": 5.1975082682744524,
"learning_rate": 4.958137268909887e-06,
"loss": 2.7287,
"step": 21,
"trainloss/critic_chosen": 1.1857198476791382,
"trainloss/critic_rejected": 1.2193048000335693,
"trainloss/reward": 1.1857198476791382,
"trainrewards/accuracies": 0.9114583730697632,
"trainrewards/chosen": 1.4296875,
"trainrewards/margins": 2.21875,
"trainrewards/rejected": -0.7890625
},
{
"epoch": 0.16,
"grad_norm": 5.466958288822169,
"learning_rate": 4.9453690018345144e-06,
"loss": 2.7514,
"step": 22,
"trainloss/critic_chosen": 1.1868751049041748,
"trainloss/critic_rejected": 1.256333827972412,
"trainloss/reward": 1.1868751049041748,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 0.953125,
"trainrewards/margins": 1.71875,
"trainrewards/rejected": -0.76171875
},
{
"epoch": 0.17,
"grad_norm": 4.795005616591591,
"learning_rate": 4.930924800994192e-06,
"loss": 2.7025,
"step": 23,
"trainloss/critic_chosen": 1.1841645240783691,
"trainloss/critic_rejected": 1.2626478672027588,
"trainloss/reward": 1.1841645240783691,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.0625,
"trainrewards/margins": 2.09375,
"trainrewards/rejected": -1.0390625
},
{
"epoch": 0.18,
"grad_norm": 7.216599935232891,
"learning_rate": 4.914814565722671e-06,
"loss": 2.7024,
"step": 24,
"trainloss/critic_chosen": 1.1600149869918823,
"trainloss/critic_rejected": 1.216670036315918,
"trainloss/reward": 1.1600149869918823,
"trainrewards/accuracies": 0.90625,
"trainrewards/chosen": 1.953125,
"trainrewards/margins": 2.53125,
"trainrewards/rejected": -0.578125
},
{
"epoch": 0.19,
"grad_norm": 5.574536341669933,
"learning_rate": 4.897049337170483e-06,
"loss": 2.6825,
"step": 25,
"trainloss/critic_chosen": 1.17496919631958,
"trainloss/critic_rejected": 1.2430278062820435,
"trainloss/reward": 1.17496919631958,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.84375,
"trainrewards/margins": 2.71875,
"trainrewards/rejected": -0.87109375
},
{
"epoch": 0.19,
"grad_norm": 8.130831144336229,
"learning_rate": 4.8776412907378845e-06,
"loss": 2.7403,
"step": 26,
"trainloss/critic_chosen": 1.1843974590301514,
"trainloss/critic_rejected": 1.2316250801086426,
"trainloss/reward": 1.1843974590301514,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 0.322265625,
"trainrewards/margins": 2.09375,
"trainrewards/rejected": -1.78125
},
{
"epoch": 0.2,
"grad_norm": 4.106462749941039,
"learning_rate": 4.856603727730446e-06,
"loss": 2.6318,
"step": 27,
"trainloss/critic_chosen": 1.1325989961624146,
"trainloss/critic_rejected": 1.2125966548919678,
"trainloss/reward": 1.1325989961624146,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 0.98828125,
"trainrewards/margins": 1.859375,
"trainrewards/rejected": -0.875
},
{
"epoch": 0.21,
"grad_norm": 7.501840024960186,
"learning_rate": 4.833951066243004e-06,
"loss": 2.7439,
"step": 28,
"trainloss/critic_chosen": 1.156808853149414,
"trainloss/critic_rejected": 1.218095064163208,
"trainloss/reward": 1.156808853149414,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 2.03125,
"trainrewards/margins": 2.0,
"trainrewards/rejected": 0.021240234375
},
{
"epoch": 0.22,
"grad_norm": 10.542645887143404,
"learning_rate": 4.809698831278217e-06,
"loss": 2.6949,
"step": 29,
"trainloss/critic_chosen": 1.146854043006897,
"trainloss/critic_rejected": 1.2151950597763062,
"trainloss/reward": 1.146854043006897,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 2.546875,
"trainrewards/margins": 2.375,
"trainrewards/rejected": 0.169921875
},
{
"epoch": 0.22,
"grad_norm": 5.112451478263716,
"learning_rate": 4.783863644106502e-06,
"loss": 2.6784,
"step": 30,
"trainloss/critic_chosen": 1.1554011106491089,
"trainloss/critic_rejected": 1.2330732345581055,
"trainloss/reward": 1.1554011106491089,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.6875,
"trainrewards/margins": 2.390625,
"trainrewards/rejected": -0.703125
},
{
"epoch": 0.23,
"grad_norm": 5.406168864359725,
"learning_rate": 4.7564632108746524e-06,
"loss": 2.716,
"step": 31,
"trainloss/critic_chosen": 1.162062168121338,
"trainloss/critic_rejected": 1.2383043766021729,
"trainloss/reward": 1.162062168121338,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 0.609375,
"trainrewards/margins": 1.9765625,
"trainrewards/rejected": -1.3671875
},
{
"epoch": 0.24,
"grad_norm": 5.067065444400827,
"learning_rate": 4.72751631047092e-06,
"loss": 2.6516,
"step": 32,
"trainloss/critic_chosen": 1.1599905490875244,
"trainloss/critic_rejected": 1.2166763544082642,
"trainloss/reward": 1.1599905490875244,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 0.7265625,
"trainrewards/margins": 2.21875,
"trainrewards/rejected": -1.4921875
},
{
"epoch": 0.25,
"grad_norm": 6.759001147106114,
"learning_rate": 4.697042781654913e-06,
"loss": 2.6586,
"step": 33,
"trainloss/critic_chosen": 1.1513490676879883,
"trainloss/critic_rejected": 1.1805285215377808,
"trainloss/reward": 1.1513490676879883,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.8203125,
"trainrewards/margins": 2.234375,
"trainrewards/rejected": -0.408203125
},
{
"epoch": 0.25,
"grad_norm": 7.671545596305826,
"learning_rate": 4.665063509461098e-06,
"loss": 2.6397,
"step": 34,
"trainloss/critic_chosen": 1.1355525255203247,
"trainloss/critic_rejected": 1.1824406385421753,
"trainloss/reward": 1.1355525255203247,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 2.15625,
"trainrewards/margins": 2.34375,
"trainrewards/rejected": -0.1923828125
},
{
"epoch": 0.26,
"grad_norm": 4.120967770831028,
"learning_rate": 4.631600410885231e-06,
"loss": 2.6941,
"step": 35,
"trainloss/critic_chosen": 1.1876243352890015,
"trainloss/critic_rejected": 1.2462928295135498,
"trainloss/reward": 1.1876243352890015,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.6640625,
"trainrewards/margins": 2.453125,
"trainrewards/rejected": -0.78125
},
{
"epoch": 0.27,
"grad_norm": 4.873851901547121,
"learning_rate": 4.596676419863561e-06,
"loss": 2.5644,
"step": 36,
"trainloss/critic_chosen": 1.1080451011657715,
"trainloss/critic_rejected": 1.1967337131500244,
"trainloss/reward": 1.1080451011657715,
"trainrewards/accuracies": 0.96875,
"trainrewards/chosen": 0.80078125,
"trainrewards/margins": 2.125,
"trainrewards/rejected": -1.328125
},
{
"epoch": 0.28,
"grad_norm": 3.9620697117121004,
"learning_rate": 4.560315471555039e-06,
"loss": 2.5956,
"step": 37,
"trainloss/critic_chosen": 1.1373480558395386,
"trainloss/critic_rejected": 1.2142869234085083,
"trainloss/reward": 1.1373480558395386,
"trainrewards/accuracies": 0.9375000596046448,
"trainrewards/chosen": 1.0234375,
"trainrewards/margins": 2.40625,
"trainrewards/rejected": -1.390625
},
{
"epoch": 0.28,
"grad_norm": 5.768390511994523,
"learning_rate": 4.522542485937369e-06,
"loss": 2.6888,
"step": 38,
"trainloss/critic_chosen": 1.1469529867172241,
"trainloss/critic_rejected": 1.2045722007751465,
"trainloss/reward": 1.1469529867172241,
"trainrewards/accuracies": 0.9114583730697632,
"trainrewards/chosen": 1.8046875,
"trainrewards/margins": 2.484375,
"trainrewards/rejected": -0.6875
},
{
"epoch": 0.29,
"grad_norm": 4.874609149891752,
"learning_rate": 4.4833833507280884e-06,
"loss": 2.5543,
"step": 39,
"trainloss/critic_chosen": 1.1098886728286743,
"trainloss/critic_rejected": 1.1714903116226196,
"trainloss/reward": 1.1098886728286743,
"trainrewards/accuracies": 0.958333432674408,
"trainrewards/chosen": 1.8984375,
"trainrewards/margins": 2.625,
"trainrewards/rejected": -0.7265625
},
{
"epoch": 0.3,
"grad_norm": 3.396284173013532,
"learning_rate": 4.442864903642428e-06,
"loss": 2.6564,
"step": 40,
"trainloss/critic_chosen": 1.1380069255828857,
"trainloss/critic_rejected": 1.2159972190856934,
"trainloss/reward": 1.1380069255828857,
"trainrewards/accuracies": 0.9427083134651184,
"trainrewards/chosen": 0.9296875,
"trainrewards/margins": 1.9375,
"trainrewards/rejected": -1.0078125
},
{
"epoch": 0.31,
"grad_norm": 3.8092153086439087,
"learning_rate": 4.401014914000078e-06,
"loss": 2.5515,
"step": 41,
"trainloss/critic_chosen": 1.123491883277893,
"trainloss/critic_rejected": 1.1983730792999268,
"trainloss/reward": 1.123491883277893,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.015625,
"trainrewards/margins": 2.125,
"trainrewards/rejected": -1.1015625
},
{
"epoch": 0.31,
"grad_norm": 3.292224209377405,
"learning_rate": 4.357862063693486e-06,
"loss": 2.6296,
"step": 42,
"trainloss/critic_chosen": 1.1296896934509277,
"trainloss/critic_rejected": 1.193892002105713,
"trainloss/reward": 1.1296896934509277,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.3671875,
"trainrewards/margins": 2.234375,
"trainrewards/rejected": -0.859375
},
{
"epoch": 0.32,
"grad_norm": 4.97304962284229,
"learning_rate": 4.313435927530719e-06,
"loss": 2.5984,
"step": 43,
"trainloss/critic_chosen": 1.106866478919983,
"trainloss/critic_rejected": 1.1839522123336792,
"trainloss/reward": 1.106866478919983,
"trainrewards/accuracies": 0.9166666865348816,
"trainrewards/chosen": 1.859375,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -0.6640625
},
{
"epoch": 0.33,
"grad_norm": 3.1437931542625615,
"learning_rate": 4.267766952966369e-06,
"loss": 2.6053,
"step": 44,
"trainloss/critic_chosen": 1.141026496887207,
"trainloss/critic_rejected": 1.1881659030914307,
"trainloss/reward": 1.141026496887207,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.484375,
"trainrewards/margins": 2.5,
"trainrewards/rejected": -1.015625
},
{
"epoch": 0.34,
"grad_norm": 3.0260425195721727,
"learning_rate": 4.220886439234385e-06,
"loss": 2.6162,
"step": 45,
"trainloss/critic_chosen": 1.1437909603118896,
"trainloss/critic_rejected": 1.1694350242614746,
"trainloss/reward": 1.1437909603118896,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.3359375,
"trainrewards/margins": 2.265625,
"trainrewards/rejected": -0.93359375
},
{
"epoch": 0.34,
"grad_norm": 3.9421991947992803,
"learning_rate": 4.172826515897146e-06,
"loss": 2.559,
"step": 46,
"trainloss/critic_chosen": 1.1193464994430542,
"trainloss/critic_rejected": 1.1624045372009277,
"trainloss/reward": 1.1193464994430542,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.2109375,
"trainrewards/margins": 1.96875,
"trainrewards/rejected": -0.75
},
{
"epoch": 0.35,
"grad_norm": 4.76800798471375,
"learning_rate": 4.123620120825459e-06,
"loss": 2.5633,
"step": 47,
"trainloss/critic_chosen": 1.1039447784423828,
"trainloss/critic_rejected": 1.1683855056762695,
"trainloss/reward": 1.1039447784423828,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.5,
"trainrewards/margins": 1.8515625,
"trainrewards/rejected": -0.357421875
},
{
"epoch": 0.36,
"grad_norm": 4.677899041279874,
"learning_rate": 4.073300977624594e-06,
"loss": 2.6104,
"step": 48,
"trainloss/critic_chosen": 1.1293267011642456,
"trainloss/critic_rejected": 1.173600435256958,
"trainloss/reward": 1.1293267011642456,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.5625,
"trainrewards/margins": 1.953125,
"trainrewards/rejected": -0.38671875
},
{
"epoch": 0.36,
"grad_norm": 2.8973280324668815,
"learning_rate": 4.021903572521802e-06,
"loss": 2.5884,
"step": 49,
"trainloss/critic_chosen": 1.1289738416671753,
"trainloss/critic_rejected": 1.169731855392456,
"trainloss/reward": 1.1289738416671753,
"trainrewards/accuracies": 0.9375000596046448,
"trainrewards/chosen": 1.3125,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -1.203125
},
{
"epoch": 0.37,
"grad_norm": 2.9211772383175685,
"learning_rate": 3.969463130731183e-06,
"loss": 2.5868,
"step": 50,
"trainloss/critic_chosen": 1.1367411613464355,
"trainloss/critic_rejected": 1.1725157499313354,
"trainloss/reward": 1.1367411613464355,
"trainrewards/accuracies": 0.9114583730697632,
"trainrewards/chosen": 1.265625,
"trainrewards/margins": 2.484375,
"trainrewards/rejected": -1.21875
},
{
"epoch": 0.38,
"grad_norm": 3.4239805909008627,
"learning_rate": 3.916015592312083e-06,
"loss": 2.5442,
"step": 51,
"trainloss/critic_chosen": 1.101191759109497,
"trainloss/critic_rejected": 1.2063257694244385,
"trainloss/reward": 1.101191759109497,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.6171875,
"trainrewards/margins": 2.546875,
"trainrewards/rejected": -0.9296875
},
{
"epoch": 0.39,
"grad_norm": 3.3449791279382155,
"learning_rate": 3.861597587537568e-06,
"loss": 2.5532,
"step": 52,
"trainloss/critic_chosen": 1.1064534187316895,
"trainloss/critic_rejected": 1.1979490518569946,
"trainloss/reward": 1.1064534187316895,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.6015625,
"trainrewards/margins": 2.46875,
"trainrewards/rejected": -0.875
},
{
"epoch": 0.39,
"grad_norm": 3.9419484082989724,
"learning_rate": 3.806246411789872e-06,
"loss": 2.6147,
"step": 53,
"trainloss/critic_chosen": 1.138405680656433,
"trainloss/critic_rejected": 1.1973538398742676,
"trainloss/reward": 1.138405680656433,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.25,
"trainrewards/margins": 2.609375,
"trainrewards/rejected": -1.3671875
},
{
"epoch": 0.4,
"grad_norm": 3.4980254593155413,
"learning_rate": 3.7500000000000005e-06,
"loss": 2.5364,
"step": 54,
"trainloss/critic_chosen": 1.0845965147018433,
"trainloss/critic_rejected": 1.1989306211471558,
"trainloss/reward": 1.0845965147018433,
"trainrewards/accuracies": 0.9635417461395264,
"trainrewards/chosen": 1.6953125,
"trainrewards/margins": 2.59375,
"trainrewards/rejected": -0.89453125
},
{
"epoch": 0.41,
"grad_norm": 3.7684432316267347,
"learning_rate": 3.6928969006490212e-06,
"loss": 2.5578,
"step": 55,
"trainloss/critic_chosen": 1.105364441871643,
"trainloss/critic_rejected": 1.1862692832946777,
"trainloss/reward": 1.105364441871643,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 1.8046875,
"trainrewards/margins": 2.65625,
"trainrewards/rejected": -0.86328125
},
{
"epoch": 0.42,
"grad_norm": 2.733004985886796,
"learning_rate": 3.634976249348867e-06,
"loss": 2.5665,
"step": 56,
"trainloss/critic_chosen": 1.1256849765777588,
"trainloss/critic_rejected": 1.1650742292404175,
"trainloss/reward": 1.1256849765777588,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.328125,
"trainrewards/margins": 2.390625,
"trainrewards/rejected": -1.0546875
},
{
"epoch": 0.42,
"grad_norm": 3.063205301802556,
"learning_rate": 3.5762777420207382e-06,
"loss": 2.5733,
"step": 57,
"trainloss/critic_chosen": 1.1022924184799194,
"trainloss/critic_rejected": 1.1559257507324219,
"trainloss/reward": 1.1022924184799194,
"trainrewards/accuracies": 0.9166666865348816,
"trainrewards/chosen": 1.40625,
"trainrewards/margins": 2.28125,
"trainrewards/rejected": -0.875
},
{
"epoch": 0.43,
"grad_norm": 3.2936675397250985,
"learning_rate": 3.516841607689501e-06,
"loss": 2.529,
"step": 58,
"trainloss/critic_chosen": 1.0919924974441528,
"trainloss/critic_rejected": 1.1807957887649536,
"trainloss/reward": 1.0919924974441528,
"trainrewards/accuracies": 0.9375000596046448,
"trainrewards/chosen": 1.0703125,
"trainrewards/margins": 2.0625,
"trainrewards/rejected": -1.0
},
{
"epoch": 0.44,
"grad_norm": 2.9687788874925505,
"learning_rate": 3.4567085809127247e-06,
"loss": 2.5538,
"step": 59,
"trainloss/critic_chosen": 1.152530312538147,
"trainloss/critic_rejected": 1.128198504447937,
"trainloss/reward": 1.152530312538147,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.3125,
"trainrewards/margins": 2.171875,
"trainrewards/rejected": -0.86328125
},
{
"epoch": 0.45,
"grad_norm": 2.5189366946202374,
"learning_rate": 3.39591987386325e-06,
"loss": 2.4931,
"step": 60,
"trainloss/critic_chosen": 1.0971665382385254,
"trainloss/critic_rejected": 1.189927339553833,
"trainloss/reward": 1.0971665382385254,
"trainrewards/accuracies": 0.96875,
"trainrewards/chosen": 1.3828125,
"trainrewards/margins": 2.671875,
"trainrewards/rejected": -1.2890625
},
{
"epoch": 0.45,
"grad_norm": 4.707774798123127,
"learning_rate": 3.3345171480844275e-06,
"loss": 2.4995,
"step": 61,
"trainloss/critic_chosen": 1.1144541501998901,
"trainloss/critic_rejected": 1.1472208499908447,
"trainloss/reward": 1.1144541501998901,
"trainrewards/accuracies": 0.9739583730697632,
"trainrewards/chosen": 1.9921875,
"trainrewards/margins": 2.765625,
"trainrewards/rejected": -0.7734375
},
{
"epoch": 0.46,
"grad_norm": 3.621977923726089,
"learning_rate": 3.272542485937369e-06,
"loss": 2.5767,
"step": 62,
"trainloss/critic_chosen": 1.1388683319091797,
"trainloss/critic_rejected": 1.1852062940597534,
"trainloss/reward": 1.1388683319091797,
"trainrewards/accuracies": 0.9479167461395264,
"trainrewards/chosen": 1.8203125,
"trainrewards/margins": 3.09375,
"trainrewards/rejected": -1.265625
},
{
"epoch": 0.47,
"grad_norm": 4.340502288849219,
"learning_rate": 3.2100383617598075e-06,
"loss": 2.5008,
"step": 63,
"trainloss/critic_chosen": 1.0960522890090942,
"trainloss/critic_rejected": 1.1389869451522827,
"trainloss/reward": 1.0960522890090942,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.25,
"trainrewards/margins": 2.8125,
"trainrewards/rejected": -1.5703125
},
{
"epoch": 0.48,
"grad_norm": 3.2652013602478087,
"learning_rate": 3.147047612756302e-06,
"loss": 2.4784,
"step": 64,
"trainloss/critic_chosen": 1.1066646575927734,
"trainloss/critic_rejected": 1.1423835754394531,
"trainloss/reward": 1.1066646575927734,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.2265625,
"trainrewards/margins": 2.859375,
"trainrewards/rejected": -1.6328125
},
{
"epoch": 0.48,
"grad_norm": 4.460312181878758,
"learning_rate": 3.0836134096397642e-06,
"loss": 2.5315,
"step": 65,
"trainloss/critic_chosen": 1.097680926322937,
"trainloss/critic_rejected": 1.1829330921173096,
"trainloss/reward": 1.097680926322937,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.71875,
"trainrewards/margins": 2.375,
"trainrewards/rejected": -0.66015625
},
{
"epoch": 0.49,
"grad_norm": 5.398290397831798,
"learning_rate": 3.019779227044398e-06,
"loss": 2.4912,
"step": 66,
"trainloss/critic_chosen": 1.0728169679641724,
"trainloss/critic_rejected": 1.1528609991073608,
"trainloss/reward": 1.0728169679641724,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.75,
"trainrewards/margins": 2.1875,
"trainrewards/rejected": -0.44140625
},
{
"epoch": 0.5,
"grad_norm": 4.530365049006353,
"learning_rate": 2.9555888137303695e-06,
"loss": 2.4768,
"step": 67,
"trainloss/critic_chosen": 1.0978233814239502,
"trainloss/critic_rejected": 1.1454182863235474,
"trainloss/reward": 1.0978233814239502,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.515625,
"trainrewards/margins": 2.1875,
"trainrewards/rejected": -0.66015625
},
{
"epoch": 0.51,
"grad_norm": 3.090064735833262,
"learning_rate": 2.8910861626005774e-06,
"loss": 2.5542,
"step": 68,
"trainloss/critic_chosen": 1.1045993566513062,
"trainloss/critic_rejected": 1.1823933124542236,
"trainloss/reward": 1.1045993566513062,
"trainrewards/accuracies": 0.9166666865348816,
"trainrewards/chosen": 1.296875,
"trainrewards/margins": 2.296875,
"trainrewards/rejected": -1.0
},
{
"epoch": 0.51,
"grad_norm": 2.801294778929006,
"learning_rate": 2.82631548055013e-06,
"loss": 2.4752,
"step": 69,
"trainloss/critic_chosen": 1.0862737894058228,
"trainloss/critic_rejected": 1.1638906002044678,
"trainloss/reward": 1.0862737894058228,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.46875,
"trainrewards/margins": 2.8125,
"trainrewards/rejected": -1.359375
},
{
"epoch": 0.52,
"grad_norm": 3.5888770327583503,
"learning_rate": 2.761321158169134e-06,
"loss": 2.5502,
"step": 70,
"trainloss/critic_chosen": 1.1130059957504272,
"trainloss/critic_rejected": 1.1747164726257324,
"trainloss/reward": 1.1130059957504272,
"trainrewards/accuracies": 0.9583333134651184,
"trainrewards/chosen": 1.75,
"trainrewards/margins": 2.953125,
"trainrewards/rejected": -1.203125
},
{
"epoch": 0.53,
"grad_norm": 3.553005435624982,
"learning_rate": 2.696147739319613e-06,
"loss": 2.4735,
"step": 71,
"trainloss/critic_chosen": 1.1133400201797485,
"trainloss/critic_rejected": 1.1409944295883179,
"trainloss/reward": 1.1133400201797485,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.96875,
"trainrewards/margins": 3.375,
"trainrewards/rejected": -1.40625
},
{
"epoch": 0.54,
"grad_norm": 2.7088469336528145,
"learning_rate": 2.6308398906073603e-06,
"loss": 2.4512,
"step": 72,
"trainloss/critic_chosen": 1.1119564771652222,
"trainloss/critic_rejected": 1.1244186162948608,
"trainloss/reward": 1.1119564771652222,
"trainrewards/accuracies": 0.96875,
"trainrewards/chosen": 1.5703125,
"trainrewards/margins": 3.03125,
"trainrewards/rejected": -1.4609375
},
{
"epoch": 0.54,
"grad_norm": 3.938561115333166,
"learning_rate": 2.5654423707696834e-06,
"loss": 2.4921,
"step": 73,
"trainloss/critic_chosen": 1.0844348669052124,
"trainloss/critic_rejected": 1.163710355758667,
"trainloss/reward": 1.0844348669052124,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.0703125,
"trainrewards/margins": 2.734375,
"trainrewards/rejected": -1.6640625
},
{
"epoch": 0.55,
"grad_norm": 3.7076560975513293,
"learning_rate": 2.5e-06,
"loss": 2.4702,
"step": 74,
"trainloss/critic_chosen": 1.105428695678711,
"trainloss/critic_rejected": 1.1134750843048096,
"trainloss/reward": 1.105428695678711,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.1015625,
"trainrewards/margins": 2.4375,
"trainrewards/rejected": -1.328125
},
{
"epoch": 0.56,
"grad_norm": 4.584325275815331,
"learning_rate": 2.434557629230318e-06,
"loss": 2.5531,
"step": 75,
"trainloss/critic_chosen": 1.1023496389389038,
"trainloss/critic_rejected": 1.1693300008773804,
"trainloss/reward": 1.1023496389389038,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.6953125,
"trainrewards/margins": 2.265625,
"trainrewards/rejected": -0.5703125
},
{
"epoch": 0.57,
"grad_norm": 5.707921133643401,
"learning_rate": 2.3691601093926406e-06,
"loss": 2.512,
"step": 76,
"trainloss/critic_chosen": 1.0742218494415283,
"trainloss/critic_rejected": 1.1473562717437744,
"trainloss/reward": 1.0742218494415283,
"trainrewards/accuracies": 0.9375000596046448,
"trainrewards/chosen": 1.984375,
"trainrewards/margins": 2.359375,
"trainrewards/rejected": -0.380859375
},
{
"epoch": 0.57,
"grad_norm": 5.052893345106084,
"learning_rate": 2.3038522606803882e-06,
"loss": 2.5495,
"step": 77,
"trainloss/critic_chosen": 1.09754478931427,
"trainloss/critic_rejected": 1.175227165222168,
"trainloss/reward": 1.09754478931427,
"trainrewards/accuracies": 0.9218751192092896,
"trainrewards/chosen": 1.8671875,
"trainrewards/margins": 2.359375,
"trainrewards/rejected": -0.490234375
},
{
"epoch": 0.58,
"grad_norm": 3.505818483136781,
"learning_rate": 2.238678841830867e-06,
"loss": 2.5073,
"step": 78,
"trainloss/critic_chosen": 1.100816249847412,
"trainloss/critic_rejected": 1.1553771495819092,
"trainloss/reward": 1.100816249847412,
"trainrewards/accuracies": 0.9375000596046448,
"trainrewards/chosen": 1.4375,
"trainrewards/margins": 2.1875,
"trainrewards/rejected": -0.75
},
{
"epoch": 0.59,
"grad_norm": 4.2251215117971475,
"learning_rate": 2.173684519449872e-06,
"loss": 2.5035,
"step": 79,
"trainloss/critic_chosen": 1.093074083328247,
"trainloss/critic_rejected": 1.163825511932373,
"trainloss/reward": 1.093074083328247,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 0.91796875,
"trainrewards/margins": 2.140625,
"trainrewards/rejected": -1.21875
},
{
"epoch": 0.6,
"grad_norm": 4.171916933286059,
"learning_rate": 2.1089138373994226e-06,
"loss": 2.4726,
"step": 80,
"trainloss/critic_chosen": 1.0706841945648193,
"trainloss/critic_rejected": 1.160952091217041,
"trainloss/reward": 1.0706841945648193,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 0.98046875,
"trainrewards/margins": 2.375,
"trainrewards/rejected": -1.390625
},
{
"epoch": 0.6,
"grad_norm": 2.7690433360924085,
"learning_rate": 2.0444111862696313e-06,
"loss": 2.4269,
"step": 81,
"trainloss/critic_chosen": 1.0752573013305664,
"trainloss/critic_rejected": 1.1339901685714722,
"trainloss/reward": 1.0752573013305664,
"trainrewards/accuracies": 0.9739583730697632,
"trainrewards/chosen": 1.484375,
"trainrewards/margins": 2.578125,
"trainrewards/rejected": -1.09375
},
{
"epoch": 0.61,
"grad_norm": 3.358268001716196,
"learning_rate": 1.9802207729556023e-06,
"loss": 2.461,
"step": 82,
"trainloss/critic_chosen": 1.1075457334518433,
"trainloss/critic_rejected": 1.1157523393630981,
"trainloss/reward": 1.1075457334518433,
"trainrewards/accuracies": 0.953125,
"trainrewards/chosen": 1.8828125,
"trainrewards/margins": 2.90625,
"trainrewards/rejected": -1.0234375
},
{
"epoch": 0.62,
"grad_norm": 4.328068525423629,
"learning_rate": 1.9163865903602374e-06,
"loss": 2.5352,
"step": 83,
"trainloss/critic_chosen": 1.1028249263763428,
"trainloss/critic_rejected": 1.1644842624664307,
"trainloss/reward": 1.1028249263763428,
"trainrewards/accuracies": 0.96875,
"trainrewards/chosen": 1.8671875,
"trainrewards/margins": 2.921875,
"trainrewards/rejected": -1.0625
},
{
"epoch": 0.63,
"grad_norm": 3.266438978334478,
"learning_rate": 1.852952387243698e-06,
"loss": 2.4134,
"step": 84,
"trainloss/critic_chosen": 1.0756388902664185,
"trainloss/critic_rejected": 1.1303694248199463,
"trainloss/reward": 1.0756388902664185,
"trainrewards/accuracies": 0.9687500596046448,
"trainrewards/chosen": 1.9140625,
"trainrewards/margins": 3.25,
"trainrewards/rejected": -1.328125
},
{
"epoch": 0.63,
"grad_norm": 2.386641393706194,
"learning_rate": 1.7899616382401935e-06,
"loss": 2.401,
"step": 85,
"trainloss/critic_chosen": 1.0511287450790405,
"trainloss/critic_rejected": 1.128703236579895,
"trainloss/reward": 1.0511287450790405,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.6015625,
"trainrewards/margins": 2.953125,
"trainrewards/rejected": -1.359375
},
{
"epoch": 0.64,
"grad_norm": 3.7161933000807403,
"learning_rate": 1.7274575140626318e-06,
"loss": 2.4732,
"step": 86,
"trainloss/critic_chosen": 1.0827696323394775,
"trainloss/critic_rejected": 1.1439146995544434,
"trainloss/reward": 1.0827696323394775,
"trainrewards/accuracies": 0.9583333134651184,
"trainrewards/chosen": 1.0,
"trainrewards/margins": 2.734375,
"trainrewards/rejected": -1.734375
},
{
"epoch": 0.65,
"grad_norm": 3.4186216283012754,
"learning_rate": 1.665482851915573e-06,
"loss": 2.5064,
"step": 87,
"trainloss/critic_chosen": 1.093652367591858,
"trainloss/critic_rejected": 1.1373913288116455,
"trainloss/reward": 1.093652367591858,
"trainrewards/accuracies": 0.927083432674408,
"trainrewards/chosen": 1.09375,
"trainrewards/margins": 2.5625,
"trainrewards/rejected": -1.46875
},
{
"epoch": 0.66,
"grad_norm": 2.4263959266567996,
"learning_rate": 1.6040801261367494e-06,
"loss": 2.5409,
"step": 88,
"trainloss/critic_chosen": 1.1319228410720825,
"trainloss/critic_rejected": 1.1887366771697998,
"trainloss/reward": 1.1319228410720825,
"trainrewards/accuracies": 0.9687501192092896,
"trainrewards/chosen": 1.3125,
"trainrewards/margins": 2.6875,
"trainrewards/rejected": -1.375
},
{
"epoch": 0.66,
"grad_norm": 4.091003192293857,
"learning_rate": 1.5432914190872757e-06,
"loss": 2.5386,
"step": 89,
"trainloss/critic_chosen": 1.1037051677703857,
"trainloss/critic_rejected": 1.1342533826828003,
"trainloss/reward": 1.1037051677703857,
"trainrewards/accuracies": 0.9427083134651184,
"trainrewards/chosen": 1.640625,
"trainrewards/margins": 2.375,
"trainrewards/rejected": -0.734375
},
{
"epoch": 0.67,
"grad_norm": 4.356596196020246,
"learning_rate": 1.4831583923105e-06,
"loss": 2.4845,
"step": 90,
"trainloss/critic_chosen": 1.0889127254486084,
"trainloss/critic_rejected": 1.1599314212799072,
"trainloss/reward": 1.0889127254486084,
"trainrewards/accuracies": 0.9583333134651184,
"trainrewards/chosen": 1.875,
"trainrewards/margins": 2.59375,
"trainrewards/rejected": -0.7265625
},
{
"epoch": 0.68,
"grad_norm": 3.484859150407605,
"learning_rate": 1.4237222579792618e-06,
"loss": 2.504,
"step": 91,
"trainloss/critic_chosen": 1.1031081676483154,
"trainloss/critic_rejected": 1.1596983671188354,
"trainloss/reward": 1.1031081676483154,
"trainrewards/accuracies": 0.953125,
"trainrewards/chosen": 1.7265625,
"trainrewards/margins": 2.5,
"trainrewards/rejected": -0.765625
},
{
"epoch": 0.69,
"grad_norm": 3.5906077474254046,
"learning_rate": 1.3650237506511333e-06,
"loss": 2.497,
"step": 92,
"trainloss/critic_chosen": 1.1017568111419678,
"trainloss/critic_rejected": 1.1597734689712524,
"trainloss/reward": 1.1017568111419678,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.734375,
"trainrewards/margins": 2.609375,
"trainrewards/rejected": -0.87109375
},
{
"epoch": 0.69,
"grad_norm": 3.883326754801315,
"learning_rate": 1.307103099350979e-06,
"loss": 2.4881,
"step": 93,
"trainloss/critic_chosen": 1.1008602380752563,
"trainloss/critic_rejected": 1.1622505187988281,
"trainloss/reward": 1.1008602380752563,
"trainrewards/accuracies": 0.9374999403953552,
"trainrewards/chosen": 1.8359375,
"trainrewards/margins": 2.65625,
"trainrewards/rejected": -0.81640625
},
{
"epoch": 0.7,
"grad_norm": 3.106807473961497,
"learning_rate": 1.2500000000000007e-06,
"loss": 2.5239,
"step": 94,
"trainloss/critic_chosen": 1.1186132431030273,
"trainloss/critic_rejected": 1.1955691576004028,
"trainloss/reward": 1.1186132431030273,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.4453125,
"trainrewards/margins": 2.78125,
"trainrewards/rejected": -1.34375
},
{
"epoch": 0.71,
"grad_norm": 3.0694983477589237,
"learning_rate": 1.193753588210128e-06,
"loss": 2.4975,
"step": 95,
"trainloss/critic_chosen": 1.089274287223816,
"trainloss/critic_rejected": 1.1611120700836182,
"trainloss/reward": 1.089274287223816,
"trainrewards/accuracies": 0.9166667461395264,
"trainrewards/chosen": 1.21875,
"trainrewards/margins": 2.625,
"trainrewards/rejected": -1.4140625
},
{
"epoch": 0.72,
"grad_norm": 2.647849041797858,
"learning_rate": 1.1384024124624324e-06,
"loss": 2.4533,
"step": 96,
"trainloss/critic_chosen": 1.0731533765792847,
"trainloss/critic_rejected": 1.1588420867919922,
"trainloss/reward": 1.0731533765792847,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.2578125,
"trainrewards/margins": 2.671875,
"trainrewards/rejected": -1.40625
},
{
"epoch": 0.72,
"grad_norm": 3.0284092019206743,
"learning_rate": 1.0839844076879186e-06,
"loss": 2.52,
"step": 97,
"trainloss/critic_chosen": 1.1046061515808105,
"trainloss/critic_rejected": 1.1355366706848145,
"trainloss/reward": 1.1046061515808105,
"trainrewards/accuracies": 0.9114583134651184,
"trainrewards/chosen": 1.5234375,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -1.0
},
{
"epoch": 0.73,
"grad_norm": 3.239930376341791,
"learning_rate": 1.0305368692688175e-06,
"loss": 2.3829,
"step": 98,
"trainloss/critic_chosen": 1.0649518966674805,
"trainloss/critic_rejected": 1.1148179769515991,
"trainloss/reward": 1.0649518966674805,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.828125,
"trainrewards/margins": 2.921875,
"trainrewards/rejected": -1.0859375
},
{
"epoch": 0.74,
"grad_norm": 2.844057334430093,
"learning_rate": 9.780964274781984e-07,
"loss": 2.4761,
"step": 99,
"trainloss/critic_chosen": 1.0876730680465698,
"trainloss/critic_rejected": 1.1597809791564941,
"trainloss/reward": 1.0876730680465698,
"trainrewards/accuracies": 0.9583333134651184,
"trainrewards/chosen": 1.65625,
"trainrewards/margins": 2.703125,
"trainrewards/rejected": -1.046875
},
{
"epoch": 0.74,
"grad_norm": 2.523374395571222,
"learning_rate": 9.266990223754069e-07,
"loss": 2.4511,
"step": 100,
"trainloss/critic_chosen": 1.0983867645263672,
"trainloss/critic_rejected": 1.1455085277557373,
"trainloss/reward": 1.0983867645263672,
"trainrewards/accuracies": 0.9791666865348816,
"trainrewards/chosen": 1.5546875,
"trainrewards/margins": 2.78125,
"trainrewards/rejected": -1.21875
},
{
"epoch": 0.75,
"grad_norm": 3.3175745581436917,
"learning_rate": 8.763798791745413e-07,
"loss": 2.453,
"step": 101,
"trainloss/critic_chosen": 1.094862699508667,
"trainloss/critic_rejected": 1.1401726007461548,
"trainloss/reward": 1.094862699508667,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.625,
"trainrewards/margins": 2.78125,
"trainrewards/rejected": -1.15625
},
{
"epoch": 0.76,
"grad_norm": 2.8366252126004596,
"learning_rate": 8.271734841028553e-07,
"loss": 2.5483,
"step": 102,
"trainloss/critic_chosen": 1.0930631160736084,
"trainloss/critic_rejected": 1.173164963722229,
"trainloss/reward": 1.0930631160736084,
"trainrewards/accuracies": 0.8958333134651184,
"trainrewards/chosen": 1.390625,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -1.125
},
{
"epoch": 0.77,
"grad_norm": 2.9707853433568023,
"learning_rate": 7.791135607656147e-07,
"loss": 2.3986,
"step": 103,
"trainloss/critic_chosen": 1.0701719522476196,
"trainloss/critic_rejected": 1.1288470029830933,
"trainloss/reward": 1.0701719522476196,
"trainrewards/accuracies": 0.9791667461395264,
"trainrewards/chosen": 1.6328125,
"trainrewards/margins": 2.765625,
"trainrewards/rejected": -1.1328125
},
{
"epoch": 0.77,
"grad_norm": 3.309757754883857,
"learning_rate": 7.322330470336314e-07,
"loss": 2.429,
"step": 104,
"trainloss/critic_chosen": 1.0845046043395996,
"trainloss/critic_rejected": 1.1261361837387085,
"trainloss/reward": 1.0845046043395996,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.6796875,
"trainrewards/margins": 2.71875,
"trainrewards/rejected": -1.0234375
},
{
"epoch": 0.78,
"grad_norm": 2.4159819079210556,
"learning_rate": 6.865640724692815e-07,
"loss": 2.3868,
"step": 105,
"trainloss/critic_chosen": 1.0498684644699097,
"trainloss/critic_rejected": 1.131639003753662,
"trainloss/reward": 1.0498684644699097,
"trainrewards/accuracies": 0.9687500596046448,
"trainrewards/chosen": 1.5,
"trainrewards/margins": 2.90625,
"trainrewards/rejected": -1.3984375
},
{
"epoch": 0.79,
"grad_norm": 2.630877225161229,
"learning_rate": 6.421379363065142e-07,
"loss": 2.4745,
"step": 106,
"trainloss/critic_chosen": 1.0781538486480713,
"trainloss/critic_rejected": 1.1649324893951416,
"trainloss/reward": 1.0781538486480713,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.5,
"trainrewards/margins": 2.78125,
"trainrewards/rejected": -1.28125
},
{
"epoch": 0.8,
"grad_norm": 2.71300394057213,
"learning_rate": 5.989850859999227e-07,
"loss": 2.4433,
"step": 107,
"trainloss/critic_chosen": 1.0875132083892822,
"trainloss/critic_rejected": 1.1300991773605347,
"trainloss/reward": 1.0875132083892822,
"trainrewards/accuracies": 0.9635416865348816,
"trainrewards/chosen": 1.4140625,
"trainrewards/margins": 3.109375,
"trainrewards/rejected": -1.703125
},
{
"epoch": 0.8,
"grad_norm": 2.722489376200587,
"learning_rate": 5.571350963575728e-07,
"loss": 2.467,
"step": 108,
"trainloss/critic_chosen": 1.0709630250930786,
"trainloss/critic_rejected": 1.154178500175476,
"trainloss/reward": 1.0709630250930786,
"trainrewards/accuracies": 0.9479166865348816,
"trainrewards/chosen": 1.359375,
"trainrewards/margins": 2.859375,
"trainrewards/rejected": -1.5
},
{
"epoch": 0.81,
"grad_norm": 3.255161744830997,
"learning_rate": 5.166166492719124e-07,
"loss": 2.4854,
"step": 109,
"trainloss/critic_chosen": 1.1081310510635376,
"trainloss/critic_rejected": 1.152534008026123,
"trainloss/reward": 1.1081310510635376,
"trainrewards/accuracies": 0.973958432674408,
"trainrewards/chosen": 1.34375,
"trainrewards/margins": 2.96875,
"trainrewards/rejected": -1.6328125
},
{
"epoch": 0.82,
"grad_norm": 2.762498507683836,
"learning_rate": 4.774575140626317e-07,
"loss": 2.4388,
"step": 110,
"trainloss/critic_chosen": 1.065203070640564,
"trainloss/critic_rejected": 1.0969582796096802,
"trainloss/reward": 1.065203070640564,
"trainrewards/accuracies": 0.9635416269302368,
"trainrewards/chosen": 1.3046875,
"trainrewards/margins": 2.609375,
"trainrewards/rejected": -1.3125
},
{
"epoch": 0.83,
"grad_norm": 2.780757216314426,
"learning_rate": 4.396845284449608e-07,
"loss": 2.4319,
"step": 111,
"trainloss/critic_chosen": 1.083713173866272,
"trainloss/critic_rejected": 1.119750738143921,
"trainloss/reward": 1.083713173866272,
"trainrewards/accuracies": 0.9687500596046448,
"trainrewards/chosen": 1.7421875,
"trainrewards/margins": 3.03125,
"trainrewards/rejected": -1.296875
},
{
"epoch": 0.83,
"grad_norm": 3.7107544004289323,
"learning_rate": 4.033235801364402e-07,
"loss": 2.4846,
"step": 112,
"trainloss/critic_chosen": 1.106475830078125,
"trainloss/critic_rejected": 1.1211233139038086,
"trainloss/reward": 1.106475830078125,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.7421875,
"trainrewards/margins": 2.703125,
"trainrewards/rejected": -0.96484375
},
{
"epoch": 0.84,
"grad_norm": 3.3073751739512787,
"learning_rate": 3.683995891147696e-07,
"loss": 2.4629,
"step": 113,
"trainloss/critic_chosen": 1.0521959066390991,
"trainloss/critic_rejected": 1.173767328262329,
"trainloss/reward": 1.0521959066390991,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.828125,
"trainrewards/margins": 2.921875,
"trainrewards/rejected": -1.0859375
},
{
"epoch": 0.85,
"grad_norm": 2.99774406823753,
"learning_rate": 3.3493649053890325e-07,
"loss": 2.536,
"step": 114,
"trainloss/critic_chosen": 1.1110682487487793,
"trainloss/critic_rejected": 1.155356526374817,
"trainloss/reward": 1.1110682487487793,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 1.5859375,
"trainrewards/margins": 2.75,
"trainrewards/rejected": -1.1640625
},
{
"epoch": 0.86,
"grad_norm": 3.513330205624271,
"learning_rate": 3.0295721834508686e-07,
"loss": 2.4707,
"step": 115,
"trainloss/critic_chosen": 1.0783016681671143,
"trainloss/critic_rejected": 1.1236062049865723,
"trainloss/reward": 1.0783016681671143,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.703125,
"trainrewards/margins": 2.671875,
"trainrewards/rejected": -0.97265625
},
{
"epoch": 0.86,
"grad_norm": 2.800803642232422,
"learning_rate": 2.7248368952908055e-07,
"loss": 2.4803,
"step": 116,
"trainloss/critic_chosen": 1.080200433731079,
"trainloss/critic_rejected": 1.1515780687332153,
"trainloss/reward": 1.080200433731079,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.5546875,
"trainrewards/margins": 2.5625,
"trainrewards/rejected": -1.0
},
{
"epoch": 0.87,
"grad_norm": 2.7889069352140585,
"learning_rate": 2.43536789125349e-07,
"loss": 2.4905,
"step": 117,
"trainloss/critic_chosen": 1.088797688484192,
"trainloss/critic_rejected": 1.1520254611968994,
"trainloss/reward": 1.088797688484192,
"trainrewards/accuracies": 0.9375,
"trainrewards/chosen": 1.5,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -1.0078125
},
{
"epoch": 0.88,
"grad_norm": 2.931939214492335,
"learning_rate": 2.1613635589349756e-07,
"loss": 2.3937,
"step": 118,
"trainloss/critic_chosen": 1.0556377172470093,
"trainloss/critic_rejected": 1.1278637647628784,
"trainloss/reward": 1.0556377172470093,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.4375,
"trainrewards/margins": 2.46875,
"trainrewards/rejected": -1.03125
},
{
"epoch": 0.89,
"grad_norm": 2.954190958849344,
"learning_rate": 1.9030116872178317e-07,
"loss": 2.418,
"step": 119,
"trainloss/critic_chosen": 1.0978080034255981,
"trainloss/critic_rejected": 1.1148779392242432,
"trainloss/reward": 1.0978080034255981,
"trainrewards/accuracies": 0.9687500596046448,
"trainrewards/chosen": 1.4453125,
"trainrewards/margins": 2.46875,
"trainrewards/rejected": -1.03125
},
{
"epoch": 0.89,
"grad_norm": 2.9483773523832353,
"learning_rate": 1.6604893375699594e-07,
"loss": 2.4694,
"step": 120,
"trainloss/critic_chosen": 1.1018942594528198,
"trainloss/critic_rejected": 1.1370322704315186,
"trainloss/reward": 1.1018942594528198,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.3515625,
"trainrewards/margins": 2.40625,
"trainrewards/rejected": -1.0546875
},
{
"epoch": 0.9,
"grad_norm": 2.9215978058037764,
"learning_rate": 1.4339627226955394e-07,
"loss": 2.4822,
"step": 121,
"trainloss/critic_chosen": 1.1052017211914062,
"trainloss/critic_rejected": 1.148177981376648,
"trainloss/reward": 1.1052017211914062,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.3515625,
"trainrewards/margins": 2.515625,
"trainrewards/rejected": -1.1640625
},
{
"epoch": 0.91,
"grad_norm": 2.8843923021301667,
"learning_rate": 1.223587092621162e-07,
"loss": 2.4942,
"step": 122,
"trainloss/critic_chosen": 1.0736005306243896,
"trainloss/critic_rejected": 1.168089509010315,
"trainloss/reward": 1.0736005306243896,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.3359375,
"trainrewards/margins": 2.328125,
"trainrewards/rejected": -0.99609375
},
{
"epoch": 0.92,
"grad_norm": 2.8724683106941193,
"learning_rate": 1.0295066282951738e-07,
"loss": 2.4881,
"step": 123,
"trainloss/critic_chosen": 1.09504234790802,
"trainloss/critic_rejected": 1.1352362632751465,
"trainloss/reward": 1.09504234790802,
"trainrewards/accuracies": 0.9322916865348816,
"trainrewards/chosen": 1.4375,
"trainrewards/margins": 2.3125,
"trainrewards/rejected": -0.87109375
},
{
"epoch": 0.92,
"grad_norm": 3.0064917280475045,
"learning_rate": 8.518543427732951e-08,
"loss": 2.5066,
"step": 124,
"trainloss/critic_chosen": 1.0965893268585205,
"trainloss/critic_rejected": 1.12990403175354,
"trainloss/reward": 1.0965893268585205,
"trainrewards/accuracies": 0.9166667461395264,
"trainrewards/chosen": 1.4375,
"trainrewards/margins": 2.3125,
"trainrewards/rejected": -0.8828125
},
{
"epoch": 0.93,
"grad_norm": 2.6882210161223425,
"learning_rate": 6.907519900580862e-08,
"loss": 2.3973,
"step": 125,
"trainloss/critic_chosen": 1.0724809169769287,
"trainloss/critic_rejected": 1.1239736080169678,
"trainloss/reward": 1.0724809169769287,
"trainrewards/accuracies": 0.9687500596046448,
"trainrewards/chosen": 1.546875,
"trainrewards/margins": 2.5625,
"trainrewards/rejected": -1.015625
},
{
"epoch": 0.94,
"grad_norm": 3.2130299463812233,
"learning_rate": 5.463099816548578e-08,
"loss": 2.4583,
"step": 126,
"trainloss/critic_chosen": 1.053167700767517,
"trainloss/critic_rejected": 1.1157554388046265,
"trainloss/reward": 1.053167700767517,
"trainrewards/accuracies": 0.9270833730697632,
"trainrewards/chosen": 1.390625,
"trainrewards/margins": 2.171875,
"trainrewards/rejected": -0.7890625
},
{
"epoch": 0.95,
"grad_norm": 2.5537231163007004,
"learning_rate": 4.186273109011374e-08,
"loss": 2.5432,
"step": 127,
"trainloss/critic_chosen": 1.1048160791397095,
"trainloss/critic_rejected": 1.1709802150726318,
"trainloss/reward": 1.1048160791397095,
"trainrewards/accuracies": 0.9270833134651184,
"trainrewards/chosen": 1.234375,
"trainrewards/margins": 2.296875,
"trainrewards/rejected": -1.0546875
},
{
"epoch": 0.95,
"grad_norm": 3.455571318987563,
"learning_rate": 3.077914851215585e-08,
"loss": 2.4356,
"step": 128,
"trainloss/critic_chosen": 1.0750889778137207,
"trainloss/critic_rejected": 1.1610357761383057,
"trainloss/reward": 1.0750889778137207,
"trainrewards/accuracies": 0.9635416865348816,
"trainrewards/chosen": 1.734375,
"trainrewards/margins": 2.625,
"trainrewards/rejected": -0.89453125
},
{
"epoch": 0.96,
"grad_norm": 3.095008653826813,
"learning_rate": 2.1387846565474047e-08,
"loss": 2.4339,
"step": 129,
"trainloss/critic_chosen": 1.0719571113586426,
"trainloss/critic_rejected": 1.1504600048065186,
"trainloss/reward": 1.0719571113586426,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.6328125,
"trainrewards/margins": 2.59375,
"trainrewards/rejected": -0.96484375
},
{
"epoch": 0.97,
"grad_norm": 3.014103190303491,
"learning_rate": 1.3695261579316776e-08,
"loss": 2.4359,
"step": 130,
"trainloss/critic_chosen": 1.0608913898468018,
"trainloss/critic_rejected": 1.1623928546905518,
"trainloss/reward": 1.0608913898468018,
"trainrewards/accuracies": 0.9791667461395264,
"trainrewards/chosen": 1.5078125,
"trainrewards/margins": 2.453125,
"trainrewards/rejected": -0.94921875
},
{
"epoch": 0.98,
"grad_norm": 3.5824668969231825,
"learning_rate": 7.70666566718009e-09,
"loss": 2.457,
"step": 131,
"trainloss/critic_chosen": 1.0644171237945557,
"trainloss/critic_rejected": 1.1539928913116455,
"trainloss/reward": 1.0644171237945557,
"trainrewards/accuracies": 0.9583333730697632,
"trainrewards/chosen": 1.578125,
"trainrewards/margins": 2.5,
"trainrewards/rejected": -0.91796875
},
{
"epoch": 0.98,
"grad_norm": 3.085175163660414,
"learning_rate": 3.4261631135654174e-09,
"loss": 2.4695,
"step": 132,
"trainloss/critic_chosen": 1.0733301639556885,
"trainloss/critic_rejected": 1.1289600133895874,
"trainloss/reward": 1.0733301639556885,
"trainrewards/accuracies": 0.9427083730697632,
"trainrewards/chosen": 1.484375,
"trainrewards/margins": 2.296875,
"trainrewards/rejected": -0.80859375
},
{
"epoch": 0.99,
"grad_norm": 2.6627177185366087,
"learning_rate": 8.566875611068503e-10,
"loss": 2.456,
"step": 133,
"trainloss/critic_chosen": 1.0969208478927612,
"trainloss/critic_rejected": 1.1649024486541748,
"trainloss/reward": 1.0969208478927612,
"trainrewards/accuracies": 0.96875,
"trainrewards/chosen": 1.4765625,
"trainrewards/margins": 2.59375,
"trainrewards/rejected": -1.1171875
},
{
"epoch": 1.0,
"grad_norm": 2.6690531080971973,
"learning_rate": 0.0,
"loss": 2.4519,
"step": 134,
"trainloss/critic_chosen": 1.090218186378479,
"trainloss/critic_rejected": 1.128463625907898,
"trainloss/reward": 1.090218186378479,
"trainrewards/accuracies": 0.9531250596046448,
"trainrewards/chosen": 1.5078125,
"trainrewards/margins": 2.53125,
"trainrewards/rejected": -1.0234375
},
{
"epoch": 1.0,
"step": 134,
"total_flos": 0.0,
"train_loss": 2.6233635464710976,
"train_runtime": 32287.388,
"train_samples_per_second": 0.799,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1.0,
"max_steps": 134,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}