{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9981378026070763, |
|
"eval_steps": 500, |
|
"global_step": 134, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 20.656896651231627, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 3.6136, |
|
"step": 1, |
|
"trainloss/critic_chosen": 1.459133505821228, |
|
"trainloss/critic_rejected": 1.468864917755127, |
|
"trainloss/reward": 1.459133505821228, |
|
"trainrewards/accuracies": 0.5833333134651184, |
|
"trainrewards/chosen": 0.3359375, |
|
"trainrewards/margins": 0.0308837890625, |
|
"trainrewards/rejected": 0.3046875 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 20.747827742934472, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 3.6381, |
|
"step": 2, |
|
"trainloss/critic_chosen": 1.4447739124298096, |
|
"trainloss/critic_rejected": 1.4999535083770752, |
|
"trainloss/reward": 1.4447739124298096, |
|
"trainrewards/accuracies": 0.5104166865348816, |
|
"trainrewards/chosen": 0.314453125, |
|
"trainrewards/margins": 0.01531982421875, |
|
"trainrewards/rejected": 0.298828125 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 19.429850511676147, |
|
"learning_rate": 1.0714285714285714e-06, |
|
"loss": 3.6713, |
|
"step": 3, |
|
"trainloss/critic_chosen": 1.4738179445266724, |
|
"trainloss/critic_rejected": 1.505049228668213, |
|
"trainloss/reward": 1.4738179445266724, |
|
"trainrewards/accuracies": 0.5364583134651184, |
|
"trainrewards/chosen": 0.302734375, |
|
"trainrewards/margins": 0.015869140625, |
|
"trainrewards/rejected": 0.287109375 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 19.353394477551088, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 3.6414, |
|
"step": 4, |
|
"trainloss/critic_chosen": 1.4593632221221924, |
|
"trainloss/critic_rejected": 1.4944238662719727, |
|
"trainloss/reward": 1.4593632221221924, |
|
"trainrewards/accuracies": 0.5572916865348816, |
|
"trainrewards/chosen": 0.357421875, |
|
"trainrewards/margins": 0.0286865234375, |
|
"trainrewards/rejected": 0.328125 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 19.314515274756275, |
|
"learning_rate": 1.7857142857142859e-06, |
|
"loss": 3.6014, |
|
"step": 5, |
|
"trainloss/critic_chosen": 1.4528993368148804, |
|
"trainloss/critic_rejected": 1.5066261291503906, |
|
"trainloss/reward": 1.4528993368148804, |
|
"trainrewards/accuracies": 0.7760417461395264, |
|
"trainrewards/chosen": 0.494140625, |
|
"trainrewards/margins": 0.130859375, |
|
"trainrewards/rejected": 0.36328125 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 17.65488892011503, |
|
"learning_rate": 2.1428571428571427e-06, |
|
"loss": 3.5132, |
|
"step": 6, |
|
"trainloss/critic_chosen": 1.4163868427276611, |
|
"trainloss/critic_rejected": 1.4701489210128784, |
|
"trainloss/reward": 1.4163868427276611, |
|
"trainrewards/accuracies": 0.7864583730697632, |
|
"trainrewards/chosen": 0.54296875, |
|
"trainrewards/margins": 0.169921875, |
|
"trainrewards/rejected": 0.373046875 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 14.922878013160528, |
|
"learning_rate": 2.5e-06, |
|
"loss": 3.4609, |
|
"step": 7, |
|
"trainloss/critic_chosen": 1.4025382995605469, |
|
"trainloss/critic_rejected": 1.4922353029251099, |
|
"trainloss/reward": 1.4025382995605469, |
|
"trainrewards/accuracies": 0.8281250596046448, |
|
"trainrewards/chosen": 0.9453125, |
|
"trainrewards/margins": 0.376953125, |
|
"trainrewards/rejected": 0.56640625 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 14.232594419094823, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 3.3368, |
|
"step": 8, |
|
"trainloss/critic_chosen": 1.3685592412948608, |
|
"trainloss/critic_rejected": 1.4283447265625, |
|
"trainloss/reward": 1.3685592412948608, |
|
"trainrewards/accuracies": 0.8645833730697632, |
|
"trainrewards/chosen": 1.0078125, |
|
"trainrewards/margins": 0.47265625, |
|
"trainrewards/rejected": 0.53515625 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 12.480541507697524, |
|
"learning_rate": 3.2142857142857147e-06, |
|
"loss": 3.0686, |
|
"step": 9, |
|
"trainloss/critic_chosen": 1.3232171535491943, |
|
"trainloss/critic_rejected": 1.3606585264205933, |
|
"trainloss/reward": 1.3232171535491943, |
|
"trainrewards/accuracies": 0.9114583134651184, |
|
"trainrewards/chosen": 1.671875, |
|
"trainrewards/margins": 1.3359375, |
|
"trainrewards/rejected": 0.333984375 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.25153148096521, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 3.0194, |
|
"step": 10, |
|
"trainloss/critic_chosen": 1.289527177810669, |
|
"trainloss/critic_rejected": 1.3465213775634766, |
|
"trainloss/reward": 1.289527177810669, |
|
"trainrewards/accuracies": 0.9010416865348816, |
|
"trainrewards/chosen": 1.3125, |
|
"trainrewards/margins": 1.4375, |
|
"trainrewards/rejected": -0.1220703125 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 15.573626174864643, |
|
"learning_rate": 3.928571428571429e-06, |
|
"loss": 2.9476, |
|
"step": 11, |
|
"trainloss/critic_chosen": 1.266632318496704, |
|
"trainloss/critic_rejected": 1.3359544277191162, |
|
"trainloss/reward": 1.266632318496704, |
|
"trainrewards/accuracies": 0.9010417461395264, |
|
"trainrewards/chosen": 0.208984375, |
|
"trainrewards/margins": 2.25, |
|
"trainrewards/rejected": -2.046875 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.798738351150583, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 2.9208, |
|
"step": 12, |
|
"trainloss/critic_chosen": 1.285549521446228, |
|
"trainloss/critic_rejected": 1.336460828781128, |
|
"trainloss/reward": 1.285549521446228, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 1.390625, |
|
"trainrewards/margins": 2.453125, |
|
"trainrewards/rejected": -1.0625 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 10.920314695532706, |
|
"learning_rate": 4.642857142857144e-06, |
|
"loss": 2.9737, |
|
"step": 13, |
|
"trainloss/critic_chosen": 1.3031879663467407, |
|
"trainloss/critic_rejected": 1.3538345098495483, |
|
"trainloss/reward": 1.3031879663467407, |
|
"trainrewards/accuracies": 0.9583333134651184, |
|
"trainrewards/chosen": 1.9453125, |
|
"trainrewards/margins": 2.203125, |
|
"trainrewards/rejected": -0.255859375 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.902411216600844, |
|
"learning_rate": 5e-06, |
|
"loss": 2.8568, |
|
"step": 14, |
|
"trainloss/critic_chosen": 1.2445811033248901, |
|
"trainloss/critic_rejected": 1.3195788860321045, |
|
"trainloss/reward": 1.2445811033248901, |
|
"trainrewards/accuracies": 0.90625, |
|
"trainrewards/chosen": 1.5078125, |
|
"trainrewards/margins": 2.09375, |
|
"trainrewards/rejected": -0.5859375 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.497405609399582, |
|
"learning_rate": 4.999143312438893e-06, |
|
"loss": 2.8485, |
|
"step": 15, |
|
"trainloss/critic_chosen": 1.2575112581253052, |
|
"trainloss/critic_rejected": 1.303661823272705, |
|
"trainloss/reward": 1.2575112581253052, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 0.55078125, |
|
"trainrewards/margins": 1.84375, |
|
"trainrewards/rejected": -1.2890625 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8.767557307270321, |
|
"learning_rate": 4.9965738368864345e-06, |
|
"loss": 2.8737, |
|
"step": 16, |
|
"trainloss/critic_chosen": 1.2387288808822632, |
|
"trainloss/critic_rejected": 1.3038721084594727, |
|
"trainloss/reward": 1.2387288808822632, |
|
"trainrewards/accuracies": 0.9010416865348816, |
|
"trainrewards/chosen": 1.578125, |
|
"trainrewards/margins": 2.59375, |
|
"trainrewards/rejected": -1.015625 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 8.749611939075479, |
|
"learning_rate": 4.992293334332821e-06, |
|
"loss": 2.8681, |
|
"step": 17, |
|
"trainloss/critic_chosen": 1.2373957633972168, |
|
"trainloss/critic_rejected": 1.302640676498413, |
|
"trainloss/reward": 1.2373957633972168, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.5859375, |
|
"trainrewards/margins": 2.203125, |
|
"trainrewards/rejected": -0.61328125 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 8.484573172931489, |
|
"learning_rate": 4.986304738420684e-06, |
|
"loss": 2.8305, |
|
"step": 18, |
|
"trainloss/critic_chosen": 1.23964262008667, |
|
"trainloss/critic_rejected": 1.297165870666504, |
|
"trainloss/reward": 1.23964262008667, |
|
"trainrewards/accuracies": 0.9166666865348816, |
|
"trainrewards/chosen": 0.60546875, |
|
"trainrewards/margins": 1.765625, |
|
"trainrewards/rejected": -1.15625 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.43676917020596, |
|
"learning_rate": 4.978612153434527e-06, |
|
"loss": 2.7193, |
|
"step": 19, |
|
"trainloss/critic_chosen": 1.2180960178375244, |
|
"trainloss/critic_rejected": 1.2327347993850708, |
|
"trainloss/reward": 1.2180960178375244, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.5546875, |
|
"trainrewards/margins": 2.296875, |
|
"trainrewards/rejected": -0.7421875 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.651490595569425, |
|
"learning_rate": 4.9692208514878445e-06, |
|
"loss": 2.8528, |
|
"step": 20, |
|
"trainloss/critic_chosen": 1.2103852033615112, |
|
"trainloss/critic_rejected": 1.3035379648208618, |
|
"trainloss/reward": 1.2103852033615112, |
|
"trainrewards/accuracies": 0.9062500596046448, |
|
"trainrewards/chosen": 1.75, |
|
"trainrewards/margins": 2.6875, |
|
"trainrewards/rejected": -0.94921875 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.1975082682744524, |
|
"learning_rate": 4.958137268909887e-06, |
|
"loss": 2.7287, |
|
"step": 21, |
|
"trainloss/critic_chosen": 1.1857198476791382, |
|
"trainloss/critic_rejected": 1.2193048000335693, |
|
"trainloss/reward": 1.1857198476791382, |
|
"trainrewards/accuracies": 0.9114583730697632, |
|
"trainrewards/chosen": 1.4296875, |
|
"trainrewards/margins": 2.21875, |
|
"trainrewards/rejected": -0.7890625 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.466958288822169, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 2.7514, |
|
"step": 22, |
|
"trainloss/critic_chosen": 1.1868751049041748, |
|
"trainloss/critic_rejected": 1.256333827972412, |
|
"trainloss/reward": 1.1868751049041748, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 0.953125, |
|
"trainrewards/margins": 1.71875, |
|
"trainrewards/rejected": -0.76171875 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.795005616591591, |
|
"learning_rate": 4.930924800994192e-06, |
|
"loss": 2.7025, |
|
"step": 23, |
|
"trainloss/critic_chosen": 1.1841645240783691, |
|
"trainloss/critic_rejected": 1.2626478672027588, |
|
"trainloss/reward": 1.1841645240783691, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.0625, |
|
"trainrewards/margins": 2.09375, |
|
"trainrewards/rejected": -1.0390625 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.216599935232891, |
|
"learning_rate": 4.914814565722671e-06, |
|
"loss": 2.7024, |
|
"step": 24, |
|
"trainloss/critic_chosen": 1.1600149869918823, |
|
"trainloss/critic_rejected": 1.216670036315918, |
|
"trainloss/reward": 1.1600149869918823, |
|
"trainrewards/accuracies": 0.90625, |
|
"trainrewards/chosen": 1.953125, |
|
"trainrewards/margins": 2.53125, |
|
"trainrewards/rejected": -0.578125 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.574536341669933, |
|
"learning_rate": 4.897049337170483e-06, |
|
"loss": 2.6825, |
|
"step": 25, |
|
"trainloss/critic_chosen": 1.17496919631958, |
|
"trainloss/critic_rejected": 1.2430278062820435, |
|
"trainloss/reward": 1.17496919631958, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.84375, |
|
"trainrewards/margins": 2.71875, |
|
"trainrewards/rejected": -0.87109375 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 8.130831144336229, |
|
"learning_rate": 4.8776412907378845e-06, |
|
"loss": 2.7403, |
|
"step": 26, |
|
"trainloss/critic_chosen": 1.1843974590301514, |
|
"trainloss/critic_rejected": 1.2316250801086426, |
|
"trainloss/reward": 1.1843974590301514, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 0.322265625, |
|
"trainrewards/margins": 2.09375, |
|
"trainrewards/rejected": -1.78125 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.106462749941039, |
|
"learning_rate": 4.856603727730446e-06, |
|
"loss": 2.6318, |
|
"step": 27, |
|
"trainloss/critic_chosen": 1.1325989961624146, |
|
"trainloss/critic_rejected": 1.2125966548919678, |
|
"trainloss/reward": 1.1325989961624146, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 0.98828125, |
|
"trainrewards/margins": 1.859375, |
|
"trainrewards/rejected": -0.875 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.501840024960186, |
|
"learning_rate": 4.833951066243004e-06, |
|
"loss": 2.7439, |
|
"step": 28, |
|
"trainloss/critic_chosen": 1.156808853149414, |
|
"trainloss/critic_rejected": 1.218095064163208, |
|
"trainloss/reward": 1.156808853149414, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 2.03125, |
|
"trainrewards/margins": 2.0, |
|
"trainrewards/rejected": 0.021240234375 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 10.542645887143404, |
|
"learning_rate": 4.809698831278217e-06, |
|
"loss": 2.6949, |
|
"step": 29, |
|
"trainloss/critic_chosen": 1.146854043006897, |
|
"trainloss/critic_rejected": 1.2151950597763062, |
|
"trainloss/reward": 1.146854043006897, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 2.546875, |
|
"trainrewards/margins": 2.375, |
|
"trainrewards/rejected": 0.169921875 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.112451478263716, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 2.6784, |
|
"step": 30, |
|
"trainloss/critic_chosen": 1.1554011106491089, |
|
"trainloss/critic_rejected": 1.2330732345581055, |
|
"trainloss/reward": 1.1554011106491089, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.6875, |
|
"trainrewards/margins": 2.390625, |
|
"trainrewards/rejected": -0.703125 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.406168864359725, |
|
"learning_rate": 4.7564632108746524e-06, |
|
"loss": 2.716, |
|
"step": 31, |
|
"trainloss/critic_chosen": 1.162062168121338, |
|
"trainloss/critic_rejected": 1.2383043766021729, |
|
"trainloss/reward": 1.162062168121338, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 0.609375, |
|
"trainrewards/margins": 1.9765625, |
|
"trainrewards/rejected": -1.3671875 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.067065444400827, |
|
"learning_rate": 4.72751631047092e-06, |
|
"loss": 2.6516, |
|
"step": 32, |
|
"trainloss/critic_chosen": 1.1599905490875244, |
|
"trainloss/critic_rejected": 1.2166763544082642, |
|
"trainloss/reward": 1.1599905490875244, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 0.7265625, |
|
"trainrewards/margins": 2.21875, |
|
"trainrewards/rejected": -1.4921875 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.759001147106114, |
|
"learning_rate": 4.697042781654913e-06, |
|
"loss": 2.6586, |
|
"step": 33, |
|
"trainloss/critic_chosen": 1.1513490676879883, |
|
"trainloss/critic_rejected": 1.1805285215377808, |
|
"trainloss/reward": 1.1513490676879883, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.8203125, |
|
"trainrewards/margins": 2.234375, |
|
"trainrewards/rejected": -0.408203125 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 7.671545596305826, |
|
"learning_rate": 4.665063509461098e-06, |
|
"loss": 2.6397, |
|
"step": 34, |
|
"trainloss/critic_chosen": 1.1355525255203247, |
|
"trainloss/critic_rejected": 1.1824406385421753, |
|
"trainloss/reward": 1.1355525255203247, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 2.15625, |
|
"trainrewards/margins": 2.34375, |
|
"trainrewards/rejected": -0.1923828125 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.120967770831028, |
|
"learning_rate": 4.631600410885231e-06, |
|
"loss": 2.6941, |
|
"step": 35, |
|
"trainloss/critic_chosen": 1.1876243352890015, |
|
"trainloss/critic_rejected": 1.2462928295135498, |
|
"trainloss/reward": 1.1876243352890015, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.6640625, |
|
"trainrewards/margins": 2.453125, |
|
"trainrewards/rejected": -0.78125 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.873851901547121, |
|
"learning_rate": 4.596676419863561e-06, |
|
"loss": 2.5644, |
|
"step": 36, |
|
"trainloss/critic_chosen": 1.1080451011657715, |
|
"trainloss/critic_rejected": 1.1967337131500244, |
|
"trainloss/reward": 1.1080451011657715, |
|
"trainrewards/accuracies": 0.96875, |
|
"trainrewards/chosen": 0.80078125, |
|
"trainrewards/margins": 2.125, |
|
"trainrewards/rejected": -1.328125 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.9620697117121004, |
|
"learning_rate": 4.560315471555039e-06, |
|
"loss": 2.5956, |
|
"step": 37, |
|
"trainloss/critic_chosen": 1.1373480558395386, |
|
"trainloss/critic_rejected": 1.2142869234085083, |
|
"trainloss/reward": 1.1373480558395386, |
|
"trainrewards/accuracies": 0.9375000596046448, |
|
"trainrewards/chosen": 1.0234375, |
|
"trainrewards/margins": 2.40625, |
|
"trainrewards/rejected": -1.390625 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.768390511994523, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 2.6888, |
|
"step": 38, |
|
"trainloss/critic_chosen": 1.1469529867172241, |
|
"trainloss/critic_rejected": 1.2045722007751465, |
|
"trainloss/reward": 1.1469529867172241, |
|
"trainrewards/accuracies": 0.9114583730697632, |
|
"trainrewards/chosen": 1.8046875, |
|
"trainrewards/margins": 2.484375, |
|
"trainrewards/rejected": -0.6875 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.874609149891752, |
|
"learning_rate": 4.4833833507280884e-06, |
|
"loss": 2.5543, |
|
"step": 39, |
|
"trainloss/critic_chosen": 1.1098886728286743, |
|
"trainloss/critic_rejected": 1.1714903116226196, |
|
"trainloss/reward": 1.1098886728286743, |
|
"trainrewards/accuracies": 0.958333432674408, |
|
"trainrewards/chosen": 1.8984375, |
|
"trainrewards/margins": 2.625, |
|
"trainrewards/rejected": -0.7265625 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.396284173013532, |
|
"learning_rate": 4.442864903642428e-06, |
|
"loss": 2.6564, |
|
"step": 40, |
|
"trainloss/critic_chosen": 1.1380069255828857, |
|
"trainloss/critic_rejected": 1.2159972190856934, |
|
"trainloss/reward": 1.1380069255828857, |
|
"trainrewards/accuracies": 0.9427083134651184, |
|
"trainrewards/chosen": 0.9296875, |
|
"trainrewards/margins": 1.9375, |
|
"trainrewards/rejected": -1.0078125 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.8092153086439087, |
|
"learning_rate": 4.401014914000078e-06, |
|
"loss": 2.5515, |
|
"step": 41, |
|
"trainloss/critic_chosen": 1.123491883277893, |
|
"trainloss/critic_rejected": 1.1983730792999268, |
|
"trainloss/reward": 1.123491883277893, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.015625, |
|
"trainrewards/margins": 2.125, |
|
"trainrewards/rejected": -1.1015625 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.292224209377405, |
|
"learning_rate": 4.357862063693486e-06, |
|
"loss": 2.6296, |
|
"step": 42, |
|
"trainloss/critic_chosen": 1.1296896934509277, |
|
"trainloss/critic_rejected": 1.193892002105713, |
|
"trainloss/reward": 1.1296896934509277, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.3671875, |
|
"trainrewards/margins": 2.234375, |
|
"trainrewards/rejected": -0.859375 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.97304962284229, |
|
"learning_rate": 4.313435927530719e-06, |
|
"loss": 2.5984, |
|
"step": 43, |
|
"trainloss/critic_chosen": 1.106866478919983, |
|
"trainloss/critic_rejected": 1.1839522123336792, |
|
"trainloss/reward": 1.106866478919983, |
|
"trainrewards/accuracies": 0.9166666865348816, |
|
"trainrewards/chosen": 1.859375, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -0.6640625 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.1437931542625615, |
|
"learning_rate": 4.267766952966369e-06, |
|
"loss": 2.6053, |
|
"step": 44, |
|
"trainloss/critic_chosen": 1.141026496887207, |
|
"trainloss/critic_rejected": 1.1881659030914307, |
|
"trainloss/reward": 1.141026496887207, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.484375, |
|
"trainrewards/margins": 2.5, |
|
"trainrewards/rejected": -1.015625 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.0260425195721727, |
|
"learning_rate": 4.220886439234385e-06, |
|
"loss": 2.6162, |
|
"step": 45, |
|
"trainloss/critic_chosen": 1.1437909603118896, |
|
"trainloss/critic_rejected": 1.1694350242614746, |
|
"trainloss/reward": 1.1437909603118896, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.3359375, |
|
"trainrewards/margins": 2.265625, |
|
"trainrewards/rejected": -0.93359375 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.9421991947992803, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 2.559, |
|
"step": 46, |
|
"trainloss/critic_chosen": 1.1193464994430542, |
|
"trainloss/critic_rejected": 1.1624045372009277, |
|
"trainloss/reward": 1.1193464994430542, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.2109375, |
|
"trainrewards/margins": 1.96875, |
|
"trainrewards/rejected": -0.75 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.76800798471375, |
|
"learning_rate": 4.123620120825459e-06, |
|
"loss": 2.5633, |
|
"step": 47, |
|
"trainloss/critic_chosen": 1.1039447784423828, |
|
"trainloss/critic_rejected": 1.1683855056762695, |
|
"trainloss/reward": 1.1039447784423828, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.5, |
|
"trainrewards/margins": 1.8515625, |
|
"trainrewards/rejected": -0.357421875 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.677899041279874, |
|
"learning_rate": 4.073300977624594e-06, |
|
"loss": 2.6104, |
|
"step": 48, |
|
"trainloss/critic_chosen": 1.1293267011642456, |
|
"trainloss/critic_rejected": 1.173600435256958, |
|
"trainloss/reward": 1.1293267011642456, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.5625, |
|
"trainrewards/margins": 1.953125, |
|
"trainrewards/rejected": -0.38671875 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.8973280324668815, |
|
"learning_rate": 4.021903572521802e-06, |
|
"loss": 2.5884, |
|
"step": 49, |
|
"trainloss/critic_chosen": 1.1289738416671753, |
|
"trainloss/critic_rejected": 1.169731855392456, |
|
"trainloss/reward": 1.1289738416671753, |
|
"trainrewards/accuracies": 0.9375000596046448, |
|
"trainrewards/chosen": 1.3125, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -1.203125 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.9211772383175685, |
|
"learning_rate": 3.969463130731183e-06, |
|
"loss": 2.5868, |
|
"step": 50, |
|
"trainloss/critic_chosen": 1.1367411613464355, |
|
"trainloss/critic_rejected": 1.1725157499313354, |
|
"trainloss/reward": 1.1367411613464355, |
|
"trainrewards/accuracies": 0.9114583730697632, |
|
"trainrewards/chosen": 1.265625, |
|
"trainrewards/margins": 2.484375, |
|
"trainrewards/rejected": -1.21875 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.4239805909008627, |
|
"learning_rate": 3.916015592312083e-06, |
|
"loss": 2.5442, |
|
"step": 51, |
|
"trainloss/critic_chosen": 1.101191759109497, |
|
"trainloss/critic_rejected": 1.2063257694244385, |
|
"trainloss/reward": 1.101191759109497, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.6171875, |
|
"trainrewards/margins": 2.546875, |
|
"trainrewards/rejected": -0.9296875 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.3449791279382155, |
|
"learning_rate": 3.861597587537568e-06, |
|
"loss": 2.5532, |
|
"step": 52, |
|
"trainloss/critic_chosen": 1.1064534187316895, |
|
"trainloss/critic_rejected": 1.1979490518569946, |
|
"trainloss/reward": 1.1064534187316895, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.6015625, |
|
"trainrewards/margins": 2.46875, |
|
"trainrewards/rejected": -0.875 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.9419484082989724, |
|
"learning_rate": 3.806246411789872e-06, |
|
"loss": 2.6147, |
|
"step": 53, |
|
"trainloss/critic_chosen": 1.138405680656433, |
|
"trainloss/critic_rejected": 1.1973538398742676, |
|
"trainloss/reward": 1.138405680656433, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.25, |
|
"trainrewards/margins": 2.609375, |
|
"trainrewards/rejected": -1.3671875 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.4980254593155413, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 2.5364, |
|
"step": 54, |
|
"trainloss/critic_chosen": 1.0845965147018433, |
|
"trainloss/critic_rejected": 1.1989306211471558, |
|
"trainloss/reward": 1.0845965147018433, |
|
"trainrewards/accuracies": 0.9635417461395264, |
|
"trainrewards/chosen": 1.6953125, |
|
"trainrewards/margins": 2.59375, |
|
"trainrewards/rejected": -0.89453125 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.7684432316267347, |
|
"learning_rate": 3.6928969006490212e-06, |
|
"loss": 2.5578, |
|
"step": 55, |
|
"trainloss/critic_chosen": 1.105364441871643, |
|
"trainloss/critic_rejected": 1.1862692832946777, |
|
"trainloss/reward": 1.105364441871643, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 1.8046875, |
|
"trainrewards/margins": 2.65625, |
|
"trainrewards/rejected": -0.86328125 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.733004985886796, |
|
"learning_rate": 3.634976249348867e-06, |
|
"loss": 2.5665, |
|
"step": 56, |
|
"trainloss/critic_chosen": 1.1256849765777588, |
|
"trainloss/critic_rejected": 1.1650742292404175, |
|
"trainloss/reward": 1.1256849765777588, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.328125, |
|
"trainrewards/margins": 2.390625, |
|
"trainrewards/rejected": -1.0546875 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.063205301802556, |
|
"learning_rate": 3.5762777420207382e-06, |
|
"loss": 2.5733, |
|
"step": 57, |
|
"trainloss/critic_chosen": 1.1022924184799194, |
|
"trainloss/critic_rejected": 1.1559257507324219, |
|
"trainloss/reward": 1.1022924184799194, |
|
"trainrewards/accuracies": 0.9166666865348816, |
|
"trainrewards/chosen": 1.40625, |
|
"trainrewards/margins": 2.28125, |
|
"trainrewards/rejected": -0.875 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.2936675397250985, |
|
"learning_rate": 3.516841607689501e-06, |
|
"loss": 2.529, |
|
"step": 58, |
|
"trainloss/critic_chosen": 1.0919924974441528, |
|
"trainloss/critic_rejected": 1.1807957887649536, |
|
"trainloss/reward": 1.0919924974441528, |
|
"trainrewards/accuracies": 0.9375000596046448, |
|
"trainrewards/chosen": 1.0703125, |
|
"trainrewards/margins": 2.0625, |
|
"trainrewards/rejected": -1.0 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.9687788874925505, |
|
"learning_rate": 3.4567085809127247e-06, |
|
"loss": 2.5538, |
|
"step": 59, |
|
"trainloss/critic_chosen": 1.152530312538147, |
|
"trainloss/critic_rejected": 1.128198504447937, |
|
"trainloss/reward": 1.152530312538147, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.3125, |
|
"trainrewards/margins": 2.171875, |
|
"trainrewards/rejected": -0.86328125 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.5189366946202374, |
|
"learning_rate": 3.39591987386325e-06, |
|
"loss": 2.4931, |
|
"step": 60, |
|
"trainloss/critic_chosen": 1.0971665382385254, |
|
"trainloss/critic_rejected": 1.189927339553833, |
|
"trainloss/reward": 1.0971665382385254, |
|
"trainrewards/accuracies": 0.96875, |
|
"trainrewards/chosen": 1.3828125, |
|
"trainrewards/margins": 2.671875, |
|
"trainrewards/rejected": -1.2890625 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.707774798123127, |
|
"learning_rate": 3.3345171480844275e-06, |
|
"loss": 2.4995, |
|
"step": 61, |
|
"trainloss/critic_chosen": 1.1144541501998901, |
|
"trainloss/critic_rejected": 1.1472208499908447, |
|
"trainloss/reward": 1.1144541501998901, |
|
"trainrewards/accuracies": 0.9739583730697632, |
|
"trainrewards/chosen": 1.9921875, |
|
"trainrewards/margins": 2.765625, |
|
"trainrewards/rejected": -0.7734375 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.621977923726089, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 2.5767, |
|
"step": 62, |
|
"trainloss/critic_chosen": 1.1388683319091797, |
|
"trainloss/critic_rejected": 1.1852062940597534, |
|
"trainloss/reward": 1.1388683319091797, |
|
"trainrewards/accuracies": 0.9479167461395264, |
|
"trainrewards/chosen": 1.8203125, |
|
"trainrewards/margins": 3.09375, |
|
"trainrewards/rejected": -1.265625 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 4.340502288849219, |
|
"learning_rate": 3.2100383617598075e-06, |
|
"loss": 2.5008, |
|
"step": 63, |
|
"trainloss/critic_chosen": 1.0960522890090942, |
|
"trainloss/critic_rejected": 1.1389869451522827, |
|
"trainloss/reward": 1.0960522890090942, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.25, |
|
"trainrewards/margins": 2.8125, |
|
"trainrewards/rejected": -1.5703125 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.2652013602478087, |
|
"learning_rate": 3.147047612756302e-06, |
|
"loss": 2.4784, |
|
"step": 64, |
|
"trainloss/critic_chosen": 1.1066646575927734, |
|
"trainloss/critic_rejected": 1.1423835754394531, |
|
"trainloss/reward": 1.1066646575927734, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.2265625, |
|
"trainrewards/margins": 2.859375, |
|
"trainrewards/rejected": -1.6328125 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.460312181878758, |
|
"learning_rate": 3.0836134096397642e-06, |
|
"loss": 2.5315, |
|
"step": 65, |
|
"trainloss/critic_chosen": 1.097680926322937, |
|
"trainloss/critic_rejected": 1.1829330921173096, |
|
"trainloss/reward": 1.097680926322937, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.71875, |
|
"trainrewards/margins": 2.375, |
|
"trainrewards/rejected": -0.66015625 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 5.398290397831798, |
|
"learning_rate": 3.019779227044398e-06, |
|
"loss": 2.4912, |
|
"step": 66, |
|
"trainloss/critic_chosen": 1.0728169679641724, |
|
"trainloss/critic_rejected": 1.1528609991073608, |
|
"trainloss/reward": 1.0728169679641724, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.75, |
|
"trainrewards/margins": 2.1875, |
|
"trainrewards/rejected": -0.44140625 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.530365049006353, |
|
"learning_rate": 2.9555888137303695e-06, |
|
"loss": 2.4768, |
|
"step": 67, |
|
"trainloss/critic_chosen": 1.0978233814239502, |
|
"trainloss/critic_rejected": 1.1454182863235474, |
|
"trainloss/reward": 1.0978233814239502, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.515625, |
|
"trainrewards/margins": 2.1875, |
|
"trainrewards/rejected": -0.66015625 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.090064735833262, |
|
"learning_rate": 2.8910861626005774e-06, |
|
"loss": 2.5542, |
|
"step": 68, |
|
"trainloss/critic_chosen": 1.1045993566513062, |
|
"trainloss/critic_rejected": 1.1823933124542236, |
|
"trainloss/reward": 1.1045993566513062, |
|
"trainrewards/accuracies": 0.9166666865348816, |
|
"trainrewards/chosen": 1.296875, |
|
"trainrewards/margins": 2.296875, |
|
"trainrewards/rejected": -1.0 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.801294778929006, |
|
"learning_rate": 2.82631548055013e-06, |
|
"loss": 2.4752, |
|
"step": 69, |
|
"trainloss/critic_chosen": 1.0862737894058228, |
|
"trainloss/critic_rejected": 1.1638906002044678, |
|
"trainloss/reward": 1.0862737894058228, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.46875, |
|
"trainrewards/margins": 2.8125, |
|
"trainrewards/rejected": -1.359375 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.5888770327583503, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 2.5502, |
|
"step": 70, |
|
"trainloss/critic_chosen": 1.1130059957504272, |
|
"trainloss/critic_rejected": 1.1747164726257324, |
|
"trainloss/reward": 1.1130059957504272, |
|
"trainrewards/accuracies": 0.9583333134651184, |
|
"trainrewards/chosen": 1.75, |
|
"trainrewards/margins": 2.953125, |
|
"trainrewards/rejected": -1.203125 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.553005435624982, |
|
"learning_rate": 2.696147739319613e-06, |
|
"loss": 2.4735, |
|
"step": 71, |
|
"trainloss/critic_chosen": 1.1133400201797485, |
|
"trainloss/critic_rejected": 1.1409944295883179, |
|
"trainloss/reward": 1.1133400201797485, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.96875, |
|
"trainrewards/margins": 3.375, |
|
"trainrewards/rejected": -1.40625 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.7088469336528145, |
|
"learning_rate": 2.6308398906073603e-06, |
|
"loss": 2.4512, |
|
"step": 72, |
|
"trainloss/critic_chosen": 1.1119564771652222, |
|
"trainloss/critic_rejected": 1.1244186162948608, |
|
"trainloss/reward": 1.1119564771652222, |
|
"trainrewards/accuracies": 0.96875, |
|
"trainrewards/chosen": 1.5703125, |
|
"trainrewards/margins": 3.03125, |
|
"trainrewards/rejected": -1.4609375 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.938561115333166, |
|
"learning_rate": 2.5654423707696834e-06, |
|
"loss": 2.4921, |
|
"step": 73, |
|
"trainloss/critic_chosen": 1.0844348669052124, |
|
"trainloss/critic_rejected": 1.163710355758667, |
|
"trainloss/reward": 1.0844348669052124, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.0703125, |
|
"trainrewards/margins": 2.734375, |
|
"trainrewards/rejected": -1.6640625 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.7076560975513293, |
|
"learning_rate": 2.5e-06, |
|
"loss": 2.4702, |
|
"step": 74, |
|
"trainloss/critic_chosen": 1.105428695678711, |
|
"trainloss/critic_rejected": 1.1134750843048096, |
|
"trainloss/reward": 1.105428695678711, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.1015625, |
|
"trainrewards/margins": 2.4375, |
|
"trainrewards/rejected": -1.328125 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.584325275815331, |
|
"learning_rate": 2.434557629230318e-06, |
|
"loss": 2.5531, |
|
"step": 75, |
|
"trainloss/critic_chosen": 1.1023496389389038, |
|
"trainloss/critic_rejected": 1.1693300008773804, |
|
"trainloss/reward": 1.1023496389389038, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.6953125, |
|
"trainrewards/margins": 2.265625, |
|
"trainrewards/rejected": -0.5703125 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.707921133643401, |
|
"learning_rate": 2.3691601093926406e-06, |
|
"loss": 2.512, |
|
"step": 76, |
|
"trainloss/critic_chosen": 1.0742218494415283, |
|
"trainloss/critic_rejected": 1.1473562717437744, |
|
"trainloss/reward": 1.0742218494415283, |
|
"trainrewards/accuracies": 0.9375000596046448, |
|
"trainrewards/chosen": 1.984375, |
|
"trainrewards/margins": 2.359375, |
|
"trainrewards/rejected": -0.380859375 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.052893345106084, |
|
"learning_rate": 2.3038522606803882e-06, |
|
"loss": 2.5495, |
|
"step": 77, |
|
"trainloss/critic_chosen": 1.09754478931427, |
|
"trainloss/critic_rejected": 1.175227165222168, |
|
"trainloss/reward": 1.09754478931427, |
|
"trainrewards/accuracies": 0.9218751192092896, |
|
"trainrewards/chosen": 1.8671875, |
|
"trainrewards/margins": 2.359375, |
|
"trainrewards/rejected": -0.490234375 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.505818483136781, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 2.5073, |
|
"step": 78, |
|
"trainloss/critic_chosen": 1.100816249847412, |
|
"trainloss/critic_rejected": 1.1553771495819092, |
|
"trainloss/reward": 1.100816249847412, |
|
"trainrewards/accuracies": 0.9375000596046448, |
|
"trainrewards/chosen": 1.4375, |
|
"trainrewards/margins": 2.1875, |
|
"trainrewards/rejected": -0.75 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.2251215117971475, |
|
"learning_rate": 2.173684519449872e-06, |
|
"loss": 2.5035, |
|
"step": 79, |
|
"trainloss/critic_chosen": 1.093074083328247, |
|
"trainloss/critic_rejected": 1.163825511932373, |
|
"trainloss/reward": 1.093074083328247, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 0.91796875, |
|
"trainrewards/margins": 2.140625, |
|
"trainrewards/rejected": -1.21875 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.171916933286059, |
|
"learning_rate": 2.1089138373994226e-06, |
|
"loss": 2.4726, |
|
"step": 80, |
|
"trainloss/critic_chosen": 1.0706841945648193, |
|
"trainloss/critic_rejected": 1.160952091217041, |
|
"trainloss/reward": 1.0706841945648193, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 0.98046875, |
|
"trainrewards/margins": 2.375, |
|
"trainrewards/rejected": -1.390625 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.7690433360924085, |
|
"learning_rate": 2.0444111862696313e-06, |
|
"loss": 2.4269, |
|
"step": 81, |
|
"trainloss/critic_chosen": 1.0752573013305664, |
|
"trainloss/critic_rejected": 1.1339901685714722, |
|
"trainloss/reward": 1.0752573013305664, |
|
"trainrewards/accuracies": 0.9739583730697632, |
|
"trainrewards/chosen": 1.484375, |
|
"trainrewards/margins": 2.578125, |
|
"trainrewards/rejected": -1.09375 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.358268001716196, |
|
"learning_rate": 1.9802207729556023e-06, |
|
"loss": 2.461, |
|
"step": 82, |
|
"trainloss/critic_chosen": 1.1075457334518433, |
|
"trainloss/critic_rejected": 1.1157523393630981, |
|
"trainloss/reward": 1.1075457334518433, |
|
"trainrewards/accuracies": 0.953125, |
|
"trainrewards/chosen": 1.8828125, |
|
"trainrewards/margins": 2.90625, |
|
"trainrewards/rejected": -1.0234375 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.328068525423629, |
|
"learning_rate": 1.9163865903602374e-06, |
|
"loss": 2.5352, |
|
"step": 83, |
|
"trainloss/critic_chosen": 1.1028249263763428, |
|
"trainloss/critic_rejected": 1.1644842624664307, |
|
"trainloss/reward": 1.1028249263763428, |
|
"trainrewards/accuracies": 0.96875, |
|
"trainrewards/chosen": 1.8671875, |
|
"trainrewards/margins": 2.921875, |
|
"trainrewards/rejected": -1.0625 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.266438978334478, |
|
"learning_rate": 1.852952387243698e-06, |
|
"loss": 2.4134, |
|
"step": 84, |
|
"trainloss/critic_chosen": 1.0756388902664185, |
|
"trainloss/critic_rejected": 1.1303694248199463, |
|
"trainloss/reward": 1.0756388902664185, |
|
"trainrewards/accuracies": 0.9687500596046448, |
|
"trainrewards/chosen": 1.9140625, |
|
"trainrewards/margins": 3.25, |
|
"trainrewards/rejected": -1.328125 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.386641393706194, |
|
"learning_rate": 1.7899616382401935e-06, |
|
"loss": 2.401, |
|
"step": 85, |
|
"trainloss/critic_chosen": 1.0511287450790405, |
|
"trainloss/critic_rejected": 1.128703236579895, |
|
"trainloss/reward": 1.0511287450790405, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.6015625, |
|
"trainrewards/margins": 2.953125, |
|
"trainrewards/rejected": -1.359375 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.7161933000807403, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 2.4732, |
|
"step": 86, |
|
"trainloss/critic_chosen": 1.0827696323394775, |
|
"trainloss/critic_rejected": 1.1439146995544434, |
|
"trainloss/reward": 1.0827696323394775, |
|
"trainrewards/accuracies": 0.9583333134651184, |
|
"trainrewards/chosen": 1.0, |
|
"trainrewards/margins": 2.734375, |
|
"trainrewards/rejected": -1.734375 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.4186216283012754, |
|
"learning_rate": 1.665482851915573e-06, |
|
"loss": 2.5064, |
|
"step": 87, |
|
"trainloss/critic_chosen": 1.093652367591858, |
|
"trainloss/critic_rejected": 1.1373913288116455, |
|
"trainloss/reward": 1.093652367591858, |
|
"trainrewards/accuracies": 0.927083432674408, |
|
"trainrewards/chosen": 1.09375, |
|
"trainrewards/margins": 2.5625, |
|
"trainrewards/rejected": -1.46875 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.4263959266567996, |
|
"learning_rate": 1.6040801261367494e-06, |
|
"loss": 2.5409, |
|
"step": 88, |
|
"trainloss/critic_chosen": 1.1319228410720825, |
|
"trainloss/critic_rejected": 1.1887366771697998, |
|
"trainloss/reward": 1.1319228410720825, |
|
"trainrewards/accuracies": 0.9687501192092896, |
|
"trainrewards/chosen": 1.3125, |
|
"trainrewards/margins": 2.6875, |
|
"trainrewards/rejected": -1.375 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.091003192293857, |
|
"learning_rate": 1.5432914190872757e-06, |
|
"loss": 2.5386, |
|
"step": 89, |
|
"trainloss/critic_chosen": 1.1037051677703857, |
|
"trainloss/critic_rejected": 1.1342533826828003, |
|
"trainloss/reward": 1.1037051677703857, |
|
"trainrewards/accuracies": 0.9427083134651184, |
|
"trainrewards/chosen": 1.640625, |
|
"trainrewards/margins": 2.375, |
|
"trainrewards/rejected": -0.734375 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.356596196020246, |
|
"learning_rate": 1.4831583923105e-06, |
|
"loss": 2.4845, |
|
"step": 90, |
|
"trainloss/critic_chosen": 1.0889127254486084, |
|
"trainloss/critic_rejected": 1.1599314212799072, |
|
"trainloss/reward": 1.0889127254486084, |
|
"trainrewards/accuracies": 0.9583333134651184, |
|
"trainrewards/chosen": 1.875, |
|
"trainrewards/margins": 2.59375, |
|
"trainrewards/rejected": -0.7265625 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.484859150407605, |
|
"learning_rate": 1.4237222579792618e-06, |
|
"loss": 2.504, |
|
"step": 91, |
|
"trainloss/critic_chosen": 1.1031081676483154, |
|
"trainloss/critic_rejected": 1.1596983671188354, |
|
"trainloss/reward": 1.1031081676483154, |
|
"trainrewards/accuracies": 0.953125, |
|
"trainrewards/chosen": 1.7265625, |
|
"trainrewards/margins": 2.5, |
|
"trainrewards/rejected": -0.765625 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.5906077474254046, |
|
"learning_rate": 1.3650237506511333e-06, |
|
"loss": 2.497, |
|
"step": 92, |
|
"trainloss/critic_chosen": 1.1017568111419678, |
|
"trainloss/critic_rejected": 1.1597734689712524, |
|
"trainloss/reward": 1.1017568111419678, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.734375, |
|
"trainrewards/margins": 2.609375, |
|
"trainrewards/rejected": -0.87109375 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.883326754801315, |
|
"learning_rate": 1.307103099350979e-06, |
|
"loss": 2.4881, |
|
"step": 93, |
|
"trainloss/critic_chosen": 1.1008602380752563, |
|
"trainloss/critic_rejected": 1.1622505187988281, |
|
"trainloss/reward": 1.1008602380752563, |
|
"trainrewards/accuracies": 0.9374999403953552, |
|
"trainrewards/chosen": 1.8359375, |
|
"trainrewards/margins": 2.65625, |
|
"trainrewards/rejected": -0.81640625 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.106807473961497, |
|
"learning_rate": 1.2500000000000007e-06, |
|
"loss": 2.5239, |
|
"step": 94, |
|
"trainloss/critic_chosen": 1.1186132431030273, |
|
"trainloss/critic_rejected": 1.1955691576004028, |
|
"trainloss/reward": 1.1186132431030273, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.4453125, |
|
"trainrewards/margins": 2.78125, |
|
"trainrewards/rejected": -1.34375 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.0694983477589237, |
|
"learning_rate": 1.193753588210128e-06, |
|
"loss": 2.4975, |
|
"step": 95, |
|
"trainloss/critic_chosen": 1.089274287223816, |
|
"trainloss/critic_rejected": 1.1611120700836182, |
|
"trainloss/reward": 1.089274287223816, |
|
"trainrewards/accuracies": 0.9166667461395264, |
|
"trainrewards/chosen": 1.21875, |
|
"trainrewards/margins": 2.625, |
|
"trainrewards/rejected": -1.4140625 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.647849041797858, |
|
"learning_rate": 1.1384024124624324e-06, |
|
"loss": 2.4533, |
|
"step": 96, |
|
"trainloss/critic_chosen": 1.0731533765792847, |
|
"trainloss/critic_rejected": 1.1588420867919922, |
|
"trainloss/reward": 1.0731533765792847, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.2578125, |
|
"trainrewards/margins": 2.671875, |
|
"trainrewards/rejected": -1.40625 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.0284092019206743, |
|
"learning_rate": 1.0839844076879186e-06, |
|
"loss": 2.52, |
|
"step": 97, |
|
"trainloss/critic_chosen": 1.1046061515808105, |
|
"trainloss/critic_rejected": 1.1355366706848145, |
|
"trainloss/reward": 1.1046061515808105, |
|
"trainrewards/accuracies": 0.9114583134651184, |
|
"trainrewards/chosen": 1.5234375, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -1.0 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.239930376341791, |
|
"learning_rate": 1.0305368692688175e-06, |
|
"loss": 2.3829, |
|
"step": 98, |
|
"trainloss/critic_chosen": 1.0649518966674805, |
|
"trainloss/critic_rejected": 1.1148179769515991, |
|
"trainloss/reward": 1.0649518966674805, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.828125, |
|
"trainrewards/margins": 2.921875, |
|
"trainrewards/rejected": -1.0859375 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.844057334430093, |
|
"learning_rate": 9.780964274781984e-07, |
|
"loss": 2.4761, |
|
"step": 99, |
|
"trainloss/critic_chosen": 1.0876730680465698, |
|
"trainloss/critic_rejected": 1.1597809791564941, |
|
"trainloss/reward": 1.0876730680465698, |
|
"trainrewards/accuracies": 0.9583333134651184, |
|
"trainrewards/chosen": 1.65625, |
|
"trainrewards/margins": 2.703125, |
|
"trainrewards/rejected": -1.046875 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.523374395571222, |
|
"learning_rate": 9.266990223754069e-07, |
|
"loss": 2.4511, |
|
"step": 100, |
|
"trainloss/critic_chosen": 1.0983867645263672, |
|
"trainloss/critic_rejected": 1.1455085277557373, |
|
"trainloss/reward": 1.0983867645263672, |
|
"trainrewards/accuracies": 0.9791666865348816, |
|
"trainrewards/chosen": 1.5546875, |
|
"trainrewards/margins": 2.78125, |
|
"trainrewards/rejected": -1.21875 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.3175745581436917, |
|
"learning_rate": 8.763798791745413e-07, |
|
"loss": 2.453, |
|
"step": 101, |
|
"trainloss/critic_chosen": 1.094862699508667, |
|
"trainloss/critic_rejected": 1.1401726007461548, |
|
"trainloss/reward": 1.094862699508667, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.625, |
|
"trainrewards/margins": 2.78125, |
|
"trainrewards/rejected": -1.15625 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.8366252126004596, |
|
"learning_rate": 8.271734841028553e-07, |
|
"loss": 2.5483, |
|
"step": 102, |
|
"trainloss/critic_chosen": 1.0930631160736084, |
|
"trainloss/critic_rejected": 1.173164963722229, |
|
"trainloss/reward": 1.0930631160736084, |
|
"trainrewards/accuracies": 0.8958333134651184, |
|
"trainrewards/chosen": 1.390625, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -1.125 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.9707853433568023, |
|
"learning_rate": 7.791135607656147e-07, |
|
"loss": 2.3986, |
|
"step": 103, |
|
"trainloss/critic_chosen": 1.0701719522476196, |
|
"trainloss/critic_rejected": 1.1288470029830933, |
|
"trainloss/reward": 1.0701719522476196, |
|
"trainrewards/accuracies": 0.9791667461395264, |
|
"trainrewards/chosen": 1.6328125, |
|
"trainrewards/margins": 2.765625, |
|
"trainrewards/rejected": -1.1328125 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.309757754883857, |
|
"learning_rate": 7.322330470336314e-07, |
|
"loss": 2.429, |
|
"step": 104, |
|
"trainloss/critic_chosen": 1.0845046043395996, |
|
"trainloss/critic_rejected": 1.1261361837387085, |
|
"trainloss/reward": 1.0845046043395996, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.6796875, |
|
"trainrewards/margins": 2.71875, |
|
"trainrewards/rejected": -1.0234375 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.4159819079210556, |
|
"learning_rate": 6.865640724692815e-07, |
|
"loss": 2.3868, |
|
"step": 105, |
|
"trainloss/critic_chosen": 1.0498684644699097, |
|
"trainloss/critic_rejected": 1.131639003753662, |
|
"trainloss/reward": 1.0498684644699097, |
|
"trainrewards/accuracies": 0.9687500596046448, |
|
"trainrewards/chosen": 1.5, |
|
"trainrewards/margins": 2.90625, |
|
"trainrewards/rejected": -1.3984375 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.630877225161229, |
|
"learning_rate": 6.421379363065142e-07, |
|
"loss": 2.4745, |
|
"step": 106, |
|
"trainloss/critic_chosen": 1.0781538486480713, |
|
"trainloss/critic_rejected": 1.1649324893951416, |
|
"trainloss/reward": 1.0781538486480713, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.5, |
|
"trainrewards/margins": 2.78125, |
|
"trainrewards/rejected": -1.28125 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.71300394057213, |
|
"learning_rate": 5.989850859999227e-07, |
|
"loss": 2.4433, |
|
"step": 107, |
|
"trainloss/critic_chosen": 1.0875132083892822, |
|
"trainloss/critic_rejected": 1.1300991773605347, |
|
"trainloss/reward": 1.0875132083892822, |
|
"trainrewards/accuracies": 0.9635416865348816, |
|
"trainrewards/chosen": 1.4140625, |
|
"trainrewards/margins": 3.109375, |
|
"trainrewards/rejected": -1.703125 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.722489376200587, |
|
"learning_rate": 5.571350963575728e-07, |
|
"loss": 2.467, |
|
"step": 108, |
|
"trainloss/critic_chosen": 1.0709630250930786, |
|
"trainloss/critic_rejected": 1.154178500175476, |
|
"trainloss/reward": 1.0709630250930786, |
|
"trainrewards/accuracies": 0.9479166865348816, |
|
"trainrewards/chosen": 1.359375, |
|
"trainrewards/margins": 2.859375, |
|
"trainrewards/rejected": -1.5 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.255161744830997, |
|
"learning_rate": 5.166166492719124e-07, |
|
"loss": 2.4854, |
|
"step": 109, |
|
"trainloss/critic_chosen": 1.1081310510635376, |
|
"trainloss/critic_rejected": 1.152534008026123, |
|
"trainloss/reward": 1.1081310510635376, |
|
"trainrewards/accuracies": 0.973958432674408, |
|
"trainrewards/chosen": 1.34375, |
|
"trainrewards/margins": 2.96875, |
|
"trainrewards/rejected": -1.6328125 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.762498507683836, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 2.4388, |
|
"step": 110, |
|
"trainloss/critic_chosen": 1.065203070640564, |
|
"trainloss/critic_rejected": 1.0969582796096802, |
|
"trainloss/reward": 1.065203070640564, |
|
"trainrewards/accuracies": 0.9635416269302368, |
|
"trainrewards/chosen": 1.3046875, |
|
"trainrewards/margins": 2.609375, |
|
"trainrewards/rejected": -1.3125 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.780757216314426, |
|
"learning_rate": 4.396845284449608e-07, |
|
"loss": 2.4319, |
|
"step": 111, |
|
"trainloss/critic_chosen": 1.083713173866272, |
|
"trainloss/critic_rejected": 1.119750738143921, |
|
"trainloss/reward": 1.083713173866272, |
|
"trainrewards/accuracies": 0.9687500596046448, |
|
"trainrewards/chosen": 1.7421875, |
|
"trainrewards/margins": 3.03125, |
|
"trainrewards/rejected": -1.296875 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.7107544004289323, |
|
"learning_rate": 4.033235801364402e-07, |
|
"loss": 2.4846, |
|
"step": 112, |
|
"trainloss/critic_chosen": 1.106475830078125, |
|
"trainloss/critic_rejected": 1.1211233139038086, |
|
"trainloss/reward": 1.106475830078125, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.7421875, |
|
"trainrewards/margins": 2.703125, |
|
"trainrewards/rejected": -0.96484375 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.3073751739512787, |
|
"learning_rate": 3.683995891147696e-07, |
|
"loss": 2.4629, |
|
"step": 113, |
|
"trainloss/critic_chosen": 1.0521959066390991, |
|
"trainloss/critic_rejected": 1.173767328262329, |
|
"trainloss/reward": 1.0521959066390991, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.828125, |
|
"trainrewards/margins": 2.921875, |
|
"trainrewards/rejected": -1.0859375 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.99774406823753, |
|
"learning_rate": 3.3493649053890325e-07, |
|
"loss": 2.536, |
|
"step": 114, |
|
"trainloss/critic_chosen": 1.1110682487487793, |
|
"trainloss/critic_rejected": 1.155356526374817, |
|
"trainloss/reward": 1.1110682487487793, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 1.5859375, |
|
"trainrewards/margins": 2.75, |
|
"trainrewards/rejected": -1.1640625 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.513330205624271, |
|
"learning_rate": 3.0295721834508686e-07, |
|
"loss": 2.4707, |
|
"step": 115, |
|
"trainloss/critic_chosen": 1.0783016681671143, |
|
"trainloss/critic_rejected": 1.1236062049865723, |
|
"trainloss/reward": 1.0783016681671143, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.703125, |
|
"trainrewards/margins": 2.671875, |
|
"trainrewards/rejected": -0.97265625 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.800803642232422, |
|
"learning_rate": 2.7248368952908055e-07, |
|
"loss": 2.4803, |
|
"step": 116, |
|
"trainloss/critic_chosen": 1.080200433731079, |
|
"trainloss/critic_rejected": 1.1515780687332153, |
|
"trainloss/reward": 1.080200433731079, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.5546875, |
|
"trainrewards/margins": 2.5625, |
|
"trainrewards/rejected": -1.0 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.7889069352140585, |
|
"learning_rate": 2.43536789125349e-07, |
|
"loss": 2.4905, |
|
"step": 117, |
|
"trainloss/critic_chosen": 1.088797688484192, |
|
"trainloss/critic_rejected": 1.1520254611968994, |
|
"trainloss/reward": 1.088797688484192, |
|
"trainrewards/accuracies": 0.9375, |
|
"trainrewards/chosen": 1.5, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -1.0078125 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.931939214492335, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 2.3937, |
|
"step": 118, |
|
"trainloss/critic_chosen": 1.0556377172470093, |
|
"trainloss/critic_rejected": 1.1278637647628784, |
|
"trainloss/reward": 1.0556377172470093, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.4375, |
|
"trainrewards/margins": 2.46875, |
|
"trainrewards/rejected": -1.03125 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.954190958849344, |
|
"learning_rate": 1.9030116872178317e-07, |
|
"loss": 2.418, |
|
"step": 119, |
|
"trainloss/critic_chosen": 1.0978080034255981, |
|
"trainloss/critic_rejected": 1.1148779392242432, |
|
"trainloss/reward": 1.0978080034255981, |
|
"trainrewards/accuracies": 0.9687500596046448, |
|
"trainrewards/chosen": 1.4453125, |
|
"trainrewards/margins": 2.46875, |
|
"trainrewards/rejected": -1.03125 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.9483773523832353, |
|
"learning_rate": 1.6604893375699594e-07, |
|
"loss": 2.4694, |
|
"step": 120, |
|
"trainloss/critic_chosen": 1.1018942594528198, |
|
"trainloss/critic_rejected": 1.1370322704315186, |
|
"trainloss/reward": 1.1018942594528198, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.3515625, |
|
"trainrewards/margins": 2.40625, |
|
"trainrewards/rejected": -1.0546875 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.9215978058037764, |
|
"learning_rate": 1.4339627226955394e-07, |
|
"loss": 2.4822, |
|
"step": 121, |
|
"trainloss/critic_chosen": 1.1052017211914062, |
|
"trainloss/critic_rejected": 1.148177981376648, |
|
"trainloss/reward": 1.1052017211914062, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.3515625, |
|
"trainrewards/margins": 2.515625, |
|
"trainrewards/rejected": -1.1640625 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.8843923021301667, |
|
"learning_rate": 1.223587092621162e-07, |
|
"loss": 2.4942, |
|
"step": 122, |
|
"trainloss/critic_chosen": 1.0736005306243896, |
|
"trainloss/critic_rejected": 1.168089509010315, |
|
"trainloss/reward": 1.0736005306243896, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.3359375, |
|
"trainrewards/margins": 2.328125, |
|
"trainrewards/rejected": -0.99609375 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.8724683106941193, |
|
"learning_rate": 1.0295066282951738e-07, |
|
"loss": 2.4881, |
|
"step": 123, |
|
"trainloss/critic_chosen": 1.09504234790802, |
|
"trainloss/critic_rejected": 1.1352362632751465, |
|
"trainloss/reward": 1.09504234790802, |
|
"trainrewards/accuracies": 0.9322916865348816, |
|
"trainrewards/chosen": 1.4375, |
|
"trainrewards/margins": 2.3125, |
|
"trainrewards/rejected": -0.87109375 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.0064917280475045, |
|
"learning_rate": 8.518543427732951e-08, |
|
"loss": 2.5066, |
|
"step": 124, |
|
"trainloss/critic_chosen": 1.0965893268585205, |
|
"trainloss/critic_rejected": 1.12990403175354, |
|
"trainloss/reward": 1.0965893268585205, |
|
"trainrewards/accuracies": 0.9166667461395264, |
|
"trainrewards/chosen": 1.4375, |
|
"trainrewards/margins": 2.3125, |
|
"trainrewards/rejected": -0.8828125 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.6882210161223425, |
|
"learning_rate": 6.907519900580862e-08, |
|
"loss": 2.3973, |
|
"step": 125, |
|
"trainloss/critic_chosen": 1.0724809169769287, |
|
"trainloss/critic_rejected": 1.1239736080169678, |
|
"trainloss/reward": 1.0724809169769287, |
|
"trainrewards/accuracies": 0.9687500596046448, |
|
"trainrewards/chosen": 1.546875, |
|
"trainrewards/margins": 2.5625, |
|
"trainrewards/rejected": -1.015625 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.2130299463812233, |
|
"learning_rate": 5.463099816548578e-08, |
|
"loss": 2.4583, |
|
"step": 126, |
|
"trainloss/critic_chosen": 1.053167700767517, |
|
"trainloss/critic_rejected": 1.1157554388046265, |
|
"trainloss/reward": 1.053167700767517, |
|
"trainrewards/accuracies": 0.9270833730697632, |
|
"trainrewards/chosen": 1.390625, |
|
"trainrewards/margins": 2.171875, |
|
"trainrewards/rejected": -0.7890625 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.5537231163007004, |
|
"learning_rate": 4.186273109011374e-08, |
|
"loss": 2.5432, |
|
"step": 127, |
|
"trainloss/critic_chosen": 1.1048160791397095, |
|
"trainloss/critic_rejected": 1.1709802150726318, |
|
"trainloss/reward": 1.1048160791397095, |
|
"trainrewards/accuracies": 0.9270833134651184, |
|
"trainrewards/chosen": 1.234375, |
|
"trainrewards/margins": 2.296875, |
|
"trainrewards/rejected": -1.0546875 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.455571318987563, |
|
"learning_rate": 3.077914851215585e-08, |
|
"loss": 2.4356, |
|
"step": 128, |
|
"trainloss/critic_chosen": 1.0750889778137207, |
|
"trainloss/critic_rejected": 1.1610357761383057, |
|
"trainloss/reward": 1.0750889778137207, |
|
"trainrewards/accuracies": 0.9635416865348816, |
|
"trainrewards/chosen": 1.734375, |
|
"trainrewards/margins": 2.625, |
|
"trainrewards/rejected": -0.89453125 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.095008653826813, |
|
"learning_rate": 2.1387846565474047e-08, |
|
"loss": 2.4339, |
|
"step": 129, |
|
"trainloss/critic_chosen": 1.0719571113586426, |
|
"trainloss/critic_rejected": 1.1504600048065186, |
|
"trainloss/reward": 1.0719571113586426, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.6328125, |
|
"trainrewards/margins": 2.59375, |
|
"trainrewards/rejected": -0.96484375 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.014103190303491, |
|
"learning_rate": 1.3695261579316776e-08, |
|
"loss": 2.4359, |
|
"step": 130, |
|
"trainloss/critic_chosen": 1.0608913898468018, |
|
"trainloss/critic_rejected": 1.1623928546905518, |
|
"trainloss/reward": 1.0608913898468018, |
|
"trainrewards/accuracies": 0.9791667461395264, |
|
"trainrewards/chosen": 1.5078125, |
|
"trainrewards/margins": 2.453125, |
|
"trainrewards/rejected": -0.94921875 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.5824668969231825, |
|
"learning_rate": 7.70666566718009e-09, |
|
"loss": 2.457, |
|
"step": 131, |
|
"trainloss/critic_chosen": 1.0644171237945557, |
|
"trainloss/critic_rejected": 1.1539928913116455, |
|
"trainloss/reward": 1.0644171237945557, |
|
"trainrewards/accuracies": 0.9583333730697632, |
|
"trainrewards/chosen": 1.578125, |
|
"trainrewards/margins": 2.5, |
|
"trainrewards/rejected": -0.91796875 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.085175163660414, |
|
"learning_rate": 3.4261631135654174e-09, |
|
"loss": 2.4695, |
|
"step": 132, |
|
"trainloss/critic_chosen": 1.0733301639556885, |
|
"trainloss/critic_rejected": 1.1289600133895874, |
|
"trainloss/reward": 1.0733301639556885, |
|
"trainrewards/accuracies": 0.9427083730697632, |
|
"trainrewards/chosen": 1.484375, |
|
"trainrewards/margins": 2.296875, |
|
"trainrewards/rejected": -0.80859375 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.6627177185366087, |
|
"learning_rate": 8.566875611068503e-10, |
|
"loss": 2.456, |
|
"step": 133, |
|
"trainloss/critic_chosen": 1.0969208478927612, |
|
"trainloss/critic_rejected": 1.1649024486541748, |
|
"trainloss/reward": 1.0969208478927612, |
|
"trainrewards/accuracies": 0.96875, |
|
"trainrewards/chosen": 1.4765625, |
|
"trainrewards/margins": 2.59375, |
|
"trainrewards/rejected": -1.1171875 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.6690531080971973, |
|
"learning_rate": 0.0, |
|
"loss": 2.4519, |
|
"step": 134, |
|
"trainloss/critic_chosen": 1.090218186378479, |
|
"trainloss/critic_rejected": 1.128463625907898, |
|
"trainloss/reward": 1.090218186378479, |
|
"trainrewards/accuracies": 0.9531250596046448, |
|
"trainrewards/chosen": 1.5078125, |
|
"trainrewards/margins": 2.53125, |
|
"trainrewards/rejected": -1.0234375 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 134, |
|
"total_flos": 0.0, |
|
"train_loss": 2.6233635464710976, |
|
"train_runtime": 32287.388, |
|
"train_samples_per_second": 0.799, |
|
"train_steps_per_second": 0.004 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |