Headline-Generation-Model / trainer_state.json
HiGenius's picture
Upload model v2
d4b3dc3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1389521640091116,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028473804100227792,
"grad_norm": 1.8249249458312988,
"learning_rate": 1.4099732346241459e-05,
"logits/chosen": 1.5533103942871094,
"logits/rejected": 1.544719934463501,
"logps/chosen": -192.5115509033203,
"logps/rejected": -190.71209716796875,
"loss": 0.6934,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.008504397235810757,
"rewards/margins": -0.00039802552782930434,
"rewards/rejected": -0.008106371387839317,
"step": 10
},
{
"epoch": 0.0056947608200455585,
"grad_norm": 1.7557423114776611,
"learning_rate": 1.4099464692482917e-05,
"logits/chosen": 1.467641830444336,
"logits/rejected": 1.4499633312225342,
"logps/chosen": -186.05093383789062,
"logps/rejected": -189.79763793945312,
"loss": 0.6938,
"rewards/accuracies": 0.46666669845581055,
"rewards/chosen": -0.03809415176510811,
"rewards/margins": -0.001227277098223567,
"rewards/rejected": -0.03686687722802162,
"step": 20
},
{
"epoch": 0.008542141230068337,
"grad_norm": 1.97469162940979,
"learning_rate": 1.4099197038724375e-05,
"logits/chosen": 1.4973409175872803,
"logits/rejected": 1.481737494468689,
"logps/chosen": -192.17340087890625,
"logps/rejected": -190.8618927001953,
"loss": 0.692,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03541722521185875,
"rewards/margins": 0.0023386760149151087,
"rewards/rejected": -0.037755902856588364,
"step": 30
},
{
"epoch": 0.011389521640091117,
"grad_norm": 1.5228796005249023,
"learning_rate": 1.4098929384965832e-05,
"logits/chosen": 1.6990602016448975,
"logits/rejected": 1.6831896305084229,
"logps/chosen": -187.31143188476562,
"logps/rejected": -194.3468780517578,
"loss": 0.6878,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -0.02589474990963936,
"rewards/margins": 0.010891737416386604,
"rewards/rejected": -0.036786485463380814,
"step": 40
},
{
"epoch": 0.014236902050113895,
"grad_norm": 2.0425374507904053,
"learning_rate": 1.409866173120729e-05,
"logits/chosen": 1.6863610744476318,
"logits/rejected": 1.6478984355926514,
"logps/chosen": -191.1285400390625,
"logps/rejected": -185.4095916748047,
"loss": 0.6875,
"rewards/accuracies": 0.6666667461395264,
"rewards/chosen": -0.045168228447437286,
"rewards/margins": 0.011603166349232197,
"rewards/rejected": -0.05677139759063721,
"step": 50
},
{
"epoch": 0.017084282460136675,
"grad_norm": 1.708747386932373,
"learning_rate": 1.4098394077448748e-05,
"logits/chosen": 1.4254977703094482,
"logits/rejected": 1.4232938289642334,
"logps/chosen": -190.47293090820312,
"logps/rejected": -188.8648223876953,
"loss": 0.6886,
"rewards/accuracies": 0.5333333611488342,
"rewards/chosen": -0.05600341409444809,
"rewards/margins": 0.009662959724664688,
"rewards/rejected": -0.06566638499498367,
"step": 60
},
{
"epoch": 0.019931662870159454,
"grad_norm": 1.7994238138198853,
"learning_rate": 1.4098126423690206e-05,
"logits/chosen": 1.6648858785629272,
"logits/rejected": 1.6290760040283203,
"logps/chosen": -186.66439819335938,
"logps/rejected": -185.37510681152344,
"loss": 0.6807,
"rewards/accuracies": 0.6833332777023315,
"rewards/chosen": -0.062477756291627884,
"rewards/margins": 0.02577758952975273,
"rewards/rejected": -0.08825534582138062,
"step": 70
},
{
"epoch": 0.022779043280182234,
"grad_norm": 1.9590085744857788,
"learning_rate": 1.4097858769931664e-05,
"logits/chosen": 1.5724966526031494,
"logits/rejected": 1.552328109741211,
"logps/chosen": -194.67926025390625,
"logps/rejected": -189.94973754882812,
"loss": 0.6847,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": -0.05106516554951668,
"rewards/margins": 0.018698066473007202,
"rewards/rejected": -0.06976323574781418,
"step": 80
},
{
"epoch": 0.02562642369020501,
"grad_norm": 2.6948983669281006,
"learning_rate": 1.4097591116173122e-05,
"logits/chosen": 1.6500282287597656,
"logits/rejected": 1.6157087087631226,
"logps/chosen": -190.87289428710938,
"logps/rejected": -190.57659912109375,
"loss": 0.6812,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -0.06399134546518326,
"rewards/margins": 0.0265243761241436,
"rewards/rejected": -0.09051571786403656,
"step": 90
},
{
"epoch": 0.02847380410022779,
"grad_norm": 2.4923946857452393,
"learning_rate": 1.409732346241458e-05,
"logits/chosen": 1.762634515762329,
"logits/rejected": 1.7326021194458008,
"logps/chosen": -191.38101196289062,
"logps/rejected": -190.61634826660156,
"loss": 0.6778,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.05230847746133804,
"rewards/margins": 0.03380978852510452,
"rewards/rejected": -0.08611828088760376,
"step": 100
},
{
"epoch": 0.03132118451025057,
"grad_norm": 2.1932480335235596,
"learning_rate": 1.4097055808656037e-05,
"logits/chosen": 1.9199613332748413,
"logits/rejected": 1.8599956035614014,
"logps/chosen": -191.87869262695312,
"logps/rejected": -191.84231567382812,
"loss": 0.6757,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.05387324094772339,
"rewards/margins": 0.0382312536239624,
"rewards/rejected": -0.09210449457168579,
"step": 110
},
{
"epoch": 0.03416856492027335,
"grad_norm": 1.9617068767547607,
"learning_rate": 1.4096788154897494e-05,
"logits/chosen": 1.4400994777679443,
"logits/rejected": 1.4526021480560303,
"logps/chosen": -187.902587890625,
"logps/rejected": -185.8184814453125,
"loss": 0.6764,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07060353457927704,
"rewards/margins": 0.03810857608914375,
"rewards/rejected": -0.10871211439371109,
"step": 120
},
{
"epoch": 0.037015945330296125,
"grad_norm": 2.709176540374756,
"learning_rate": 1.4096520501138952e-05,
"logits/chosen": 1.8948795795440674,
"logits/rejected": 1.934361219406128,
"logps/chosen": -191.3181610107422,
"logps/rejected": -188.64439392089844,
"loss": 0.6868,
"rewards/accuracies": 0.5500000715255737,
"rewards/chosen": -0.0670461356639862,
"rewards/margins": 0.018613968044519424,
"rewards/rejected": -0.08566009998321533,
"step": 130
},
{
"epoch": 0.03986332574031891,
"grad_norm": 2.8838157653808594,
"learning_rate": 1.409625284738041e-05,
"logits/chosen": 1.595059871673584,
"logits/rejected": 1.593703031539917,
"logps/chosen": -182.03919982910156,
"logps/rejected": -188.1974334716797,
"loss": 0.6772,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": -0.02041524276137352,
"rewards/margins": 0.03836324065923691,
"rewards/rejected": -0.05877848342061043,
"step": 140
},
{
"epoch": 0.042710706150341685,
"grad_norm": 2.5347156524658203,
"learning_rate": 1.4095985193621868e-05,
"logits/chosen": 1.4492591619491577,
"logits/rejected": 1.465135097503662,
"logps/chosen": -186.55406188964844,
"logps/rejected": -185.12742614746094,
"loss": 0.6761,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.02626597322523594,
"rewards/margins": 0.04097798839211464,
"rewards/rejected": -0.014712016098201275,
"step": 150
},
{
"epoch": 0.04555808656036447,
"grad_norm": 2.7310001850128174,
"learning_rate": 1.4095717539863326e-05,
"logits/chosen": 1.1774822473526,
"logits/rejected": 1.1877602338790894,
"logps/chosen": -192.26759338378906,
"logps/rejected": -190.67376708984375,
"loss": 0.6481,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": 0.05477526783943176,
"rewards/margins": 0.0998917669057846,
"rewards/rejected": -0.04511650279164314,
"step": 160
},
{
"epoch": 0.048405466970387244,
"grad_norm": 2.826904058456421,
"learning_rate": 1.4095449886104784e-05,
"logits/chosen": 1.487045407295227,
"logits/rejected": 1.4947443008422852,
"logps/chosen": -188.93545532226562,
"logps/rejected": -191.91812133789062,
"loss": 0.6902,
"rewards/accuracies": 0.5166666507720947,
"rewards/chosen": 0.029497122392058372,
"rewards/margins": 0.014662249013781548,
"rewards/rejected": 0.014834875240921974,
"step": 170
},
{
"epoch": 0.05125284738041002,
"grad_norm": 3.4622280597686768,
"learning_rate": 1.4095182232346241e-05,
"logits/chosen": 1.340524673461914,
"logits/rejected": 1.3434597253799438,
"logps/chosen": -196.30982971191406,
"logps/rejected": -190.28746032714844,
"loss": 0.6557,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.07341782003641129,
"rewards/margins": 0.08739937841892242,
"rewards/rejected": -0.013981550931930542,
"step": 180
},
{
"epoch": 0.0541002277904328,
"grad_norm": 2.600072145462036,
"learning_rate": 1.4094914578587699e-05,
"logits/chosen": 1.1103225946426392,
"logits/rejected": 1.1387397050857544,
"logps/chosen": -188.13656616210938,
"logps/rejected": -183.8177947998047,
"loss": 0.6545,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.09875272214412689,
"rewards/margins": 0.08882729709148407,
"rewards/rejected": 0.009925423189997673,
"step": 190
},
{
"epoch": 0.05694760820045558,
"grad_norm": 2.8922410011291504,
"learning_rate": 1.4094646924829157e-05,
"logits/chosen": 1.510562539100647,
"logits/rejected": 1.513253092765808,
"logps/chosen": -189.52235412597656,
"logps/rejected": -187.32711791992188,
"loss": 0.6551,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": 0.14362266659736633,
"rewards/margins": 0.09014402329921722,
"rewards/rejected": 0.05347864702343941,
"step": 200
},
{
"epoch": 0.05979498861047836,
"grad_norm": 3.4067959785461426,
"learning_rate": 1.4094379271070615e-05,
"logits/chosen": 1.609612226486206,
"logits/rejected": 1.616186499595642,
"logps/chosen": -184.65316772460938,
"logps/rejected": -185.62258911132812,
"loss": 0.6634,
"rewards/accuracies": 0.6333333849906921,
"rewards/chosen": 0.21420073509216309,
"rewards/margins": 0.07762883603572845,
"rewards/rejected": 0.13657189905643463,
"step": 210
},
{
"epoch": 0.06264236902050115,
"grad_norm": 3.975015640258789,
"learning_rate": 1.4094111617312074e-05,
"logits/chosen": 1.4019300937652588,
"logits/rejected": 1.4394185543060303,
"logps/chosen": -184.24755859375,
"logps/rejected": -186.11117553710938,
"loss": 0.6454,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": 0.2533518075942993,
"rewards/margins": 0.11108176410198212,
"rewards/rejected": 0.142270028591156,
"step": 220
},
{
"epoch": 0.06548974943052392,
"grad_norm": 2.7093393802642822,
"learning_rate": 1.4093843963553532e-05,
"logits/chosen": 1.3297879695892334,
"logits/rejected": 1.3092052936553955,
"logps/chosen": -184.32113647460938,
"logps/rejected": -184.41651916503906,
"loss": 0.668,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.23246827721595764,
"rewards/margins": 0.06593836843967438,
"rewards/rejected": 0.16652987897396088,
"step": 230
},
{
"epoch": 0.0683371298405467,
"grad_norm": 3.239527702331543,
"learning_rate": 1.409357630979499e-05,
"logits/chosen": 1.6373841762542725,
"logits/rejected": 1.618748426437378,
"logps/chosen": -187.17330932617188,
"logps/rejected": -187.43460083007812,
"loss": 0.6351,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.3167666792869568,
"rewards/margins": 0.1365506947040558,
"rewards/rejected": 0.180215984582901,
"step": 240
},
{
"epoch": 0.07118451025056947,
"grad_norm": 3.3547203540802,
"learning_rate": 1.4093308656036446e-05,
"logits/chosen": 1.3807073831558228,
"logits/rejected": 1.3713890314102173,
"logps/chosen": -189.984130859375,
"logps/rejected": -194.66429138183594,
"loss": 0.6498,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 0.421330988407135,
"rewards/margins": 0.11505619436502457,
"rewards/rejected": 0.3062748312950134,
"step": 250
},
{
"epoch": 0.07403189066059225,
"grad_norm": 3.3612968921661377,
"learning_rate": 1.4093041002277905e-05,
"logits/chosen": 1.3529853820800781,
"logits/rejected": 1.3378267288208008,
"logps/chosen": -182.531494140625,
"logps/rejected": -180.93429565429688,
"loss": 0.6556,
"rewards/accuracies": 0.6166667342185974,
"rewards/chosen": 0.49432888627052307,
"rewards/margins": 0.09916864335536957,
"rewards/rejected": 0.3951602280139923,
"step": 260
},
{
"epoch": 0.07687927107061504,
"grad_norm": 5.357379913330078,
"learning_rate": 1.4092773348519363e-05,
"logits/chosen": 1.3663667440414429,
"logits/rejected": 1.3557411432266235,
"logps/chosen": -180.38389587402344,
"logps/rejected": -180.8890380859375,
"loss": 0.6403,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.45921817421913147,
"rewards/margins": 0.13937883079051971,
"rewards/rejected": 0.31983932852745056,
"step": 270
},
{
"epoch": 0.07972665148063782,
"grad_norm": 4.233500003814697,
"learning_rate": 1.4092505694760821e-05,
"logits/chosen": 1.3878666162490845,
"logits/rejected": 1.3849413394927979,
"logps/chosen": -187.7335205078125,
"logps/rejected": -187.17578125,
"loss": 0.6419,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": 0.5678433179855347,
"rewards/margins": 0.13482099771499634,
"rewards/rejected": 0.4330223500728607,
"step": 280
},
{
"epoch": 0.08257403189066059,
"grad_norm": 3.41872501373291,
"learning_rate": 1.4092238041002279e-05,
"logits/chosen": 1.4805208444595337,
"logits/rejected": 1.4516006708145142,
"logps/chosen": -182.12094116210938,
"logps/rejected": -184.3780975341797,
"loss": 0.6337,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": 0.7841524481773376,
"rewards/margins": 0.16332173347473145,
"rewards/rejected": 0.6208308339118958,
"step": 290
},
{
"epoch": 0.08542141230068337,
"grad_norm": 3.346146821975708,
"learning_rate": 1.4091970387243737e-05,
"logits/chosen": 1.4589000940322876,
"logits/rejected": 1.4556543827056885,
"logps/chosen": -182.80862426757812,
"logps/rejected": -183.60992431640625,
"loss": 0.6396,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.7094445824623108,
"rewards/margins": 0.14436820149421692,
"rewards/rejected": 0.5650763511657715,
"step": 300
},
{
"epoch": 0.08826879271070615,
"grad_norm": 4.612051963806152,
"learning_rate": 1.4091702733485195e-05,
"logits/chosen": 1.5287668704986572,
"logits/rejected": 1.5147442817687988,
"logps/chosen": -182.61416625976562,
"logps/rejected": -183.923828125,
"loss": 0.6683,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": 0.8587290644645691,
"rewards/margins": 0.10906902700662613,
"rewards/rejected": 0.7496601343154907,
"step": 310
},
{
"epoch": 0.09111617312072894,
"grad_norm": 3.749986171722412,
"learning_rate": 1.4091435079726652e-05,
"logits/chosen": 1.4341462850570679,
"logits/rejected": 1.408405065536499,
"logps/chosen": -183.7834014892578,
"logps/rejected": -184.10061645507812,
"loss": 0.6241,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.9308969378471375,
"rewards/margins": 0.1793207824230194,
"rewards/rejected": 0.75157630443573,
"step": 320
},
{
"epoch": 0.09396355353075171,
"grad_norm": 5.060494899749756,
"learning_rate": 1.409116742596811e-05,
"logits/chosen": 1.3190691471099854,
"logits/rejected": 1.2641618251800537,
"logps/chosen": -176.72817993164062,
"logps/rejected": -177.97030639648438,
"loss": 0.6306,
"rewards/accuracies": 0.6333333849906921,
"rewards/chosen": 0.9671937823295593,
"rewards/margins": 0.19174639880657196,
"rewards/rejected": 0.7754473686218262,
"step": 330
},
{
"epoch": 0.09681093394077449,
"grad_norm": 3.9803237915039062,
"learning_rate": 1.4090899772209567e-05,
"logits/chosen": 1.8298852443695068,
"logits/rejected": 1.8242895603179932,
"logps/chosen": -179.95370483398438,
"logps/rejected": -179.55535888671875,
"loss": 0.6068,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": 1.0600353479385376,
"rewards/margins": 0.23250079154968262,
"rewards/rejected": 0.827534556388855,
"step": 340
},
{
"epoch": 0.09965831435079726,
"grad_norm": 5.183294773101807,
"learning_rate": 1.4090632118451025e-05,
"logits/chosen": 1.5438239574432373,
"logits/rejected": 1.513770341873169,
"logps/chosen": -180.3792724609375,
"logps/rejected": -184.63238525390625,
"loss": 0.605,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": 1.0928313732147217,
"rewards/margins": 0.2398819476366043,
"rewards/rejected": 0.8529494404792786,
"step": 350
},
{
"epoch": 0.10250569476082004,
"grad_norm": 5.333198070526123,
"learning_rate": 1.4090364464692483e-05,
"logits/chosen": 1.2938203811645508,
"logits/rejected": 1.2762978076934814,
"logps/chosen": -182.16403198242188,
"logps/rejected": -181.60708618164062,
"loss": 0.6208,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": 0.9186900854110718,
"rewards/margins": 0.23468203842639923,
"rewards/rejected": 0.6840081214904785,
"step": 360
},
{
"epoch": 0.10535307517084283,
"grad_norm": 4.603104114532471,
"learning_rate": 1.4090096810933941e-05,
"logits/chosen": 1.5223296880722046,
"logits/rejected": 1.4895663261413574,
"logps/chosen": -179.9161376953125,
"logps/rejected": -187.1411590576172,
"loss": 0.5639,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.7743527293205261,
"rewards/margins": 0.34860119223594666,
"rewards/rejected": 0.42575159668922424,
"step": 370
},
{
"epoch": 0.1082004555808656,
"grad_norm": 8.414449691772461,
"learning_rate": 1.40898291571754e-05,
"logits/chosen": 1.5139826536178589,
"logits/rejected": 1.4941532611846924,
"logps/chosen": -180.5480499267578,
"logps/rejected": -183.39735412597656,
"loss": 0.5821,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.7373024225234985,
"rewards/margins": 0.30213838815689087,
"rewards/rejected": 0.4351639747619629,
"step": 380
},
{
"epoch": 0.11104783599088838,
"grad_norm": 4.939911842346191,
"learning_rate": 1.4089561503416856e-05,
"logits/chosen": 1.5266821384429932,
"logits/rejected": 1.5277129411697388,
"logps/chosen": -182.3626251220703,
"logps/rejected": -186.07952880859375,
"loss": 0.608,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.665320098400116,
"rewards/margins": 0.2625730037689209,
"rewards/rejected": 0.40274715423583984,
"step": 390
},
{
"epoch": 0.11389521640091116,
"grad_norm": 4.656453609466553,
"learning_rate": 1.4089293849658314e-05,
"logits/chosen": 1.84712815284729,
"logits/rejected": 1.8255043029785156,
"logps/chosen": -185.7529296875,
"logps/rejected": -191.48007202148438,
"loss": 0.5975,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.485964298248291,
"rewards/margins": 0.28932809829711914,
"rewards/rejected": 0.19663624465465546,
"step": 400
},
{
"epoch": 0.11674259681093394,
"grad_norm": 6.418772220611572,
"learning_rate": 1.4089026195899772e-05,
"logits/chosen": 1.6944414377212524,
"logits/rejected": 1.6806474924087524,
"logps/chosen": -188.72129821777344,
"logps/rejected": -187.45614624023438,
"loss": 0.618,
"rewards/accuracies": 0.6833332777023315,
"rewards/chosen": 0.6671693921089172,
"rewards/margins": 0.26440855860710144,
"rewards/rejected": 0.4027608036994934,
"step": 410
},
{
"epoch": 0.11958997722095673,
"grad_norm": 6.491544723510742,
"learning_rate": 1.408875854214123e-05,
"logits/chosen": 1.5915181636810303,
"logits/rejected": 1.5981369018554688,
"logps/chosen": -183.72654724121094,
"logps/rejected": -189.572021484375,
"loss": 0.5637,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": 0.8245267868041992,
"rewards/margins": 0.3756122887134552,
"rewards/rejected": 0.44891443848609924,
"step": 420
},
{
"epoch": 0.1224373576309795,
"grad_norm": 4.8121185302734375,
"learning_rate": 1.4088490888382688e-05,
"logits/chosen": 1.659536600112915,
"logits/rejected": 1.6513128280639648,
"logps/chosen": -187.01806640625,
"logps/rejected": -188.09231567382812,
"loss": 0.6012,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.8251700401306152,
"rewards/margins": 0.31730136275291443,
"rewards/rejected": 0.5078686475753784,
"step": 430
},
{
"epoch": 0.1252847380410023,
"grad_norm": 4.413024425506592,
"learning_rate": 1.4088223234624147e-05,
"logits/chosen": 1.6399879455566406,
"logits/rejected": 1.6434142589569092,
"logps/chosen": -184.6322021484375,
"logps/rejected": -184.0763397216797,
"loss": 0.6125,
"rewards/accuracies": 0.6333333849906921,
"rewards/chosen": 0.9141266942024231,
"rewards/margins": 0.2925638258457184,
"rewards/rejected": 0.6215628981590271,
"step": 440
},
{
"epoch": 0.12813211845102507,
"grad_norm": 4.256793975830078,
"learning_rate": 1.4087955580865605e-05,
"logits/chosen": 1.6364936828613281,
"logits/rejected": 1.613063097000122,
"logps/chosen": -187.21011352539062,
"logps/rejected": -185.93507385253906,
"loss": 0.5987,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": 0.6389889121055603,
"rewards/margins": 0.27906617522239685,
"rewards/rejected": 0.35992270708084106,
"step": 450
},
{
"epoch": 0.13097949886104784,
"grad_norm": 6.61959981918335,
"learning_rate": 1.4087687927107061e-05,
"logits/chosen": 1.768164038658142,
"logits/rejected": 1.7589298486709595,
"logps/chosen": -179.0304718017578,
"logps/rejected": -189.29208374023438,
"loss": 0.5597,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": 0.7509846687316895,
"rewards/margins": 0.4243010878562927,
"rewards/rejected": 0.32668358087539673,
"step": 460
},
{
"epoch": 0.13382687927107062,
"grad_norm": 3.841958522796631,
"learning_rate": 1.408742027334852e-05,
"logits/chosen": 1.8268096446990967,
"logits/rejected": 1.7784650325775146,
"logps/chosen": -180.85340881347656,
"logps/rejected": -188.13333129882812,
"loss": 0.6454,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 0.7165217399597168,
"rewards/margins": 0.24613836407661438,
"rewards/rejected": 0.47038334608078003,
"step": 470
},
{
"epoch": 0.1366742596810934,
"grad_norm": 4.740571022033691,
"learning_rate": 1.4087152619589978e-05,
"logits/chosen": 2.0922436714172363,
"logits/rejected": 2.054403066635132,
"logps/chosen": -177.0702667236328,
"logps/rejected": -182.60592651367188,
"loss": 0.6027,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": 0.6301138997077942,
"rewards/margins": 0.36010658740997314,
"rewards/rejected": 0.27000728249549866,
"step": 480
},
{
"epoch": 0.13952164009111617,
"grad_norm": 4.578260898590088,
"learning_rate": 1.4086884965831436e-05,
"logits/chosen": 1.6760629415512085,
"logits/rejected": 1.6328535079956055,
"logps/chosen": -179.05679321289062,
"logps/rejected": -185.8597412109375,
"loss": 0.5984,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": 0.885299026966095,
"rewards/margins": 0.3471626341342926,
"rewards/rejected": 0.53813636302948,
"step": 490
},
{
"epoch": 0.14236902050113895,
"grad_norm": 9.571253776550293,
"learning_rate": 1.4086617312072894e-05,
"logits/chosen": 1.8746957778930664,
"logits/rejected": 1.8733335733413696,
"logps/chosen": -181.01669311523438,
"logps/rejected": -188.90940856933594,
"loss": 0.5617,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.8897438049316406,
"rewards/margins": 0.44727665185928345,
"rewards/rejected": 0.44246721267700195,
"step": 500
},
{
"epoch": 0.14521640091116172,
"grad_norm": 4.612638473510742,
"learning_rate": 1.4086349658314352e-05,
"logits/chosen": 1.926553726196289,
"logits/rejected": 1.9168344736099243,
"logps/chosen": -179.99130249023438,
"logps/rejected": -183.90756225585938,
"loss": 0.5381,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": 0.7969551682472229,
"rewards/margins": 0.5556143522262573,
"rewards/rejected": 0.2413407862186432,
"step": 510
},
{
"epoch": 0.1480637813211845,
"grad_norm": 5.260079860687256,
"learning_rate": 1.408608200455581e-05,
"logits/chosen": 2.0790677070617676,
"logits/rejected": 2.0622916221618652,
"logps/chosen": -191.6678009033203,
"logps/rejected": -195.73336791992188,
"loss": 0.5373,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.02513580396771431,
"rewards/margins": 0.46241217851638794,
"rewards/rejected": -0.48754796385765076,
"step": 520
},
{
"epoch": 0.15091116173120728,
"grad_norm": 6.061770439147949,
"learning_rate": 1.4085814350797267e-05,
"logits/chosen": 1.8206462860107422,
"logits/rejected": 1.8210747241973877,
"logps/chosen": -189.5788116455078,
"logps/rejected": -192.89532470703125,
"loss": 0.6216,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": -0.11778483539819717,
"rewards/margins": 0.24859514832496643,
"rewards/rejected": -0.3663800060749054,
"step": 530
},
{
"epoch": 0.15375854214123008,
"grad_norm": 7.366477012634277,
"learning_rate": 1.4085546697038725e-05,
"logits/chosen": 1.4823158979415894,
"logits/rejected": 1.4424474239349365,
"logps/chosen": -189.91629028320312,
"logps/rejected": -194.8428497314453,
"loss": 0.6584,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": 0.1267489641904831,
"rewards/margins": 0.19468553364276886,
"rewards/rejected": -0.06793657690286636,
"step": 540
},
{
"epoch": 0.15660592255125286,
"grad_norm": 6.191103935241699,
"learning_rate": 1.4085279043280183e-05,
"logits/chosen": 1.7889503240585327,
"logits/rejected": 1.7639633417129517,
"logps/chosen": -180.61203002929688,
"logps/rejected": -184.73779296875,
"loss": 0.5676,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.6674291491508484,
"rewards/margins": 0.39728331565856934,
"rewards/rejected": 0.27014586329460144,
"step": 550
},
{
"epoch": 0.15945330296127563,
"grad_norm": 5.199939250946045,
"learning_rate": 1.408501138952164e-05,
"logits/chosen": 1.800641417503357,
"logits/rejected": 1.7895002365112305,
"logps/chosen": -181.73495483398438,
"logps/rejected": -189.62301635742188,
"loss": 0.5967,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": 0.6316097974777222,
"rewards/margins": 0.381971538066864,
"rewards/rejected": 0.24963828921318054,
"step": 560
},
{
"epoch": 0.1623006833712984,
"grad_norm": 6.458276271820068,
"learning_rate": 1.4084743735763098e-05,
"logits/chosen": 1.6199853420257568,
"logits/rejected": 1.6275495290756226,
"logps/chosen": -188.76553344726562,
"logps/rejected": -193.72213745117188,
"loss": 0.5953,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.44772014021873474,
"rewards/margins": 0.34694141149520874,
"rewards/rejected": 0.10077869892120361,
"step": 570
},
{
"epoch": 0.16514806378132119,
"grad_norm": 8.612196922302246,
"learning_rate": 1.4084476082004556e-05,
"logits/chosen": 1.5094552040100098,
"logits/rejected": 1.4994395971298218,
"logps/chosen": -183.19235229492188,
"logps/rejected": -188.16197204589844,
"loss": 0.56,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.45164498686790466,
"rewards/margins": 0.4487348198890686,
"rewards/rejected": 0.002910163952037692,
"step": 580
},
{
"epoch": 0.16799544419134396,
"grad_norm": 7.949935436248779,
"learning_rate": 1.4084208428246014e-05,
"logits/chosen": 1.9034900665283203,
"logits/rejected": 1.8692665100097656,
"logps/chosen": -188.60250854492188,
"logps/rejected": -186.42356872558594,
"loss": 0.5381,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.6019371747970581,
"rewards/margins": 0.49507027864456177,
"rewards/rejected": 0.10686691105365753,
"step": 590
},
{
"epoch": 0.17084282460136674,
"grad_norm": 6.373099327087402,
"learning_rate": 1.408394077448747e-05,
"logits/chosen": 1.8054128885269165,
"logits/rejected": 1.7788407802581787,
"logps/chosen": -186.22625732421875,
"logps/rejected": -188.0371856689453,
"loss": 0.5623,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.30558791756629944,
"rewards/margins": 0.4337770938873291,
"rewards/rejected": -0.12818923592567444,
"step": 600
},
{
"epoch": 0.17369020501138951,
"grad_norm": 7.086456775665283,
"learning_rate": 1.4083673120728929e-05,
"logits/chosen": 1.990748643875122,
"logits/rejected": 1.954580307006836,
"logps/chosen": -190.6208953857422,
"logps/rejected": -194.99554443359375,
"loss": 0.5549,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": 0.09909350425004959,
"rewards/margins": 0.45813828706741333,
"rewards/rejected": -0.35904479026794434,
"step": 610
},
{
"epoch": 0.1765375854214123,
"grad_norm": 5.860842227935791,
"learning_rate": 1.4083405466970387e-05,
"logits/chosen": 1.4852467775344849,
"logits/rejected": 1.486603021621704,
"logps/chosen": -189.75148010253906,
"logps/rejected": -189.5113067626953,
"loss": 0.5515,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.12619808316230774,
"rewards/margins": 0.5254445672035217,
"rewards/rejected": -0.3992464542388916,
"step": 620
},
{
"epoch": 0.17938496583143507,
"grad_norm": 6.520554542541504,
"learning_rate": 1.4083137813211845e-05,
"logits/chosen": 1.6842315196990967,
"logits/rejected": 1.679579734802246,
"logps/chosen": -186.0720672607422,
"logps/rejected": -186.9860076904297,
"loss": 0.6135,
"rewards/accuracies": 0.6333333849906921,
"rewards/chosen": 0.2344587743282318,
"rewards/margins": 0.41377443075180054,
"rewards/rejected": -0.17931564152240753,
"step": 630
},
{
"epoch": 0.18223234624145787,
"grad_norm": 5.756778240203857,
"learning_rate": 1.4082870159453303e-05,
"logits/chosen": 1.7839409112930298,
"logits/rejected": 1.758226990699768,
"logps/chosen": -185.8971405029297,
"logps/rejected": -195.45352172851562,
"loss": 0.5255,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": 0.22731363773345947,
"rewards/margins": 0.5177304148674011,
"rewards/rejected": -0.29041680693626404,
"step": 640
},
{
"epoch": 0.18507972665148065,
"grad_norm": 6.184617519378662,
"learning_rate": 1.4082602505694762e-05,
"logits/chosen": 1.5313549041748047,
"logits/rejected": 1.5081932544708252,
"logps/chosen": -190.99046325683594,
"logps/rejected": -195.67564392089844,
"loss": 0.5314,
"rewards/accuracies": 0.6999999284744263,
"rewards/chosen": 0.30462446808815,
"rewards/margins": 0.598679780960083,
"rewards/rejected": -0.294055312871933,
"step": 650
},
{
"epoch": 0.18792710706150342,
"grad_norm": 6.379988670349121,
"learning_rate": 1.408233485193622e-05,
"logits/chosen": 1.446187973022461,
"logits/rejected": 1.4404445886611938,
"logps/chosen": -186.88339233398438,
"logps/rejected": -192.68328857421875,
"loss": 0.5728,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.21411502361297607,
"rewards/margins": 0.503459632396698,
"rewards/rejected": -0.2893446087837219,
"step": 660
},
{
"epoch": 0.1907744874715262,
"grad_norm": 6.756308078765869,
"learning_rate": 1.4082067198177676e-05,
"logits/chosen": 1.5809494256973267,
"logits/rejected": 1.5566542148590088,
"logps/chosen": -193.43441772460938,
"logps/rejected": -194.9256134033203,
"loss": 0.5794,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": 0.1698226034641266,
"rewards/margins": 0.48203667998313904,
"rewards/rejected": -0.31221404671669006,
"step": 670
},
{
"epoch": 0.19362186788154898,
"grad_norm": 8.186720848083496,
"learning_rate": 1.4081799544419134e-05,
"logits/chosen": 1.4054545164108276,
"logits/rejected": 1.389957070350647,
"logps/chosen": -184.97232055664062,
"logps/rejected": -192.24832153320312,
"loss": 0.4763,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": 0.35529452562332153,
"rewards/margins": 0.6863371133804321,
"rewards/rejected": -0.3310425579547882,
"step": 680
},
{
"epoch": 0.19646924829157175,
"grad_norm": 6.5290937423706055,
"learning_rate": 1.4081531890660593e-05,
"logits/chosen": 1.960597038269043,
"logits/rejected": 1.9493227005004883,
"logps/chosen": -190.00778198242188,
"logps/rejected": -194.88388061523438,
"loss": 0.5296,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3141848146915436,
"rewards/margins": 0.6257708072662354,
"rewards/rejected": -0.3115859925746918,
"step": 690
},
{
"epoch": 0.19931662870159453,
"grad_norm": 8.158761978149414,
"learning_rate": 1.408126423690205e-05,
"logits/chosen": 1.9396240711212158,
"logits/rejected": 1.901085615158081,
"logps/chosen": -181.5595703125,
"logps/rejected": -188.5738525390625,
"loss": 0.5346,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": 0.8007356524467468,
"rewards/margins": 0.5970959663391113,
"rewards/rejected": 0.20363974571228027,
"step": 700
},
{
"epoch": 0.2021640091116173,
"grad_norm": 7.908491611480713,
"learning_rate": 1.4080996583143509e-05,
"logits/chosen": 2.1133837699890137,
"logits/rejected": 2.0991783142089844,
"logps/chosen": -186.72463989257812,
"logps/rejected": -191.86631774902344,
"loss": 0.5594,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.43003392219543457,
"rewards/margins": 0.4883079528808594,
"rewards/rejected": -0.05827409029006958,
"step": 710
},
{
"epoch": 0.20501138952164008,
"grad_norm": 4.833853721618652,
"learning_rate": 1.4080728929384967e-05,
"logits/chosen": 1.790161371231079,
"logits/rejected": 1.7500584125518799,
"logps/chosen": -184.5858154296875,
"logps/rejected": -193.99765014648438,
"loss": 0.5817,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": 0.5971255898475647,
"rewards/margins": 0.5937383770942688,
"rewards/rejected": 0.003387200878933072,
"step": 720
},
{
"epoch": 0.20785876993166286,
"grad_norm": 6.047511100769043,
"learning_rate": 1.4080461275626425e-05,
"logits/chosen": 1.7548940181732178,
"logits/rejected": 1.7236839532852173,
"logps/chosen": -182.98989868164062,
"logps/rejected": -190.8938446044922,
"loss": 0.5335,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.427555650472641,
"rewards/margins": 0.56451416015625,
"rewards/rejected": -0.136958509683609,
"step": 730
},
{
"epoch": 0.21070615034168566,
"grad_norm": 13.955177307128906,
"learning_rate": 1.4080193621867883e-05,
"logits/chosen": 1.519672155380249,
"logits/rejected": 1.4972164630889893,
"logps/chosen": -189.01983642578125,
"logps/rejected": -195.07350158691406,
"loss": 0.5435,
"rewards/accuracies": 0.6833332777023315,
"rewards/chosen": -0.3326644003391266,
"rewards/margins": 0.5098401308059692,
"rewards/rejected": -0.8425045013427734,
"step": 740
},
{
"epoch": 0.21355353075170844,
"grad_norm": 9.393014907836914,
"learning_rate": 1.407992596810934e-05,
"logits/chosen": 1.8645124435424805,
"logits/rejected": 1.8256851434707642,
"logps/chosen": -192.75039672851562,
"logps/rejected": -196.81118774414062,
"loss": 0.5669,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -0.15597638487815857,
"rewards/margins": 0.5917509198188782,
"rewards/rejected": -0.7477271556854248,
"step": 750
},
{
"epoch": 0.2164009111617312,
"grad_norm": 13.501594543457031,
"learning_rate": 1.4079658314350798e-05,
"logits/chosen": 2.1007590293884277,
"logits/rejected": 2.077477216720581,
"logps/chosen": -191.407958984375,
"logps/rejected": -195.40225219726562,
"loss": 0.551,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.4909743666648865,
"rewards/margins": 0.5510894060134888,
"rewards/rejected": -1.04206383228302,
"step": 760
},
{
"epoch": 0.219248291571754,
"grad_norm": 11.767430305480957,
"learning_rate": 1.4079390660592256e-05,
"logits/chosen": 2.3974993228912354,
"logits/rejected": 2.375783681869507,
"logps/chosen": -193.2632293701172,
"logps/rejected": -201.26290893554688,
"loss": 0.4595,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -0.15944749116897583,
"rewards/margins": 0.7945913076400757,
"rewards/rejected": -0.9540387988090515,
"step": 770
},
{
"epoch": 0.22209567198177677,
"grad_norm": 7.549858570098877,
"learning_rate": 1.4079123006833714e-05,
"logits/chosen": 1.6657108068466187,
"logits/rejected": 1.6228992938995361,
"logps/chosen": -179.9643096923828,
"logps/rejected": -191.71475219726562,
"loss": 0.4792,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": 0.4781853258609772,
"rewards/margins": 0.878165066242218,
"rewards/rejected": -0.39997971057891846,
"step": 780
},
{
"epoch": 0.22494305239179954,
"grad_norm": 10.577963829040527,
"learning_rate": 1.4078855353075171e-05,
"logits/chosen": 1.9104713201522827,
"logits/rejected": 1.869380235671997,
"logps/chosen": -182.4436492919922,
"logps/rejected": -191.72471618652344,
"loss": 0.5378,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.591245174407959,
"rewards/margins": 0.5981950163841248,
"rewards/rejected": -0.006949782371520996,
"step": 790
},
{
"epoch": 0.22779043280182232,
"grad_norm": 10.107033729553223,
"learning_rate": 1.4078587699316629e-05,
"logits/chosen": 1.9709075689315796,
"logits/rejected": 1.941165566444397,
"logps/chosen": -187.01475524902344,
"logps/rejected": -188.1396484375,
"loss": 0.6002,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.21675971150398254,
"rewards/margins": 0.5140406489372253,
"rewards/rejected": -0.29728102684020996,
"step": 800
},
{
"epoch": 0.2306378132118451,
"grad_norm": 10.437517166137695,
"learning_rate": 1.4078320045558087e-05,
"logits/chosen": 1.768145203590393,
"logits/rejected": 1.756650686264038,
"logps/chosen": -192.96310424804688,
"logps/rejected": -203.16757202148438,
"loss": 0.6074,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.4098523259162903,
"rewards/margins": 0.6148914694786072,
"rewards/rejected": -1.024743914604187,
"step": 810
},
{
"epoch": 0.23348519362186787,
"grad_norm": 5.297072410583496,
"learning_rate": 1.4078052391799544e-05,
"logits/chosen": 1.579408884048462,
"logits/rejected": 1.557755708694458,
"logps/chosen": -195.23289489746094,
"logps/rejected": -202.75173950195312,
"loss": 0.5093,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -0.551082193851471,
"rewards/margins": 0.6868919730186462,
"rewards/rejected": -1.2379741668701172,
"step": 820
},
{
"epoch": 0.23633257403189067,
"grad_norm": 8.658047676086426,
"learning_rate": 1.4077784738041002e-05,
"logits/chosen": 1.7794520854949951,
"logits/rejected": 1.763954520225525,
"logps/chosen": -191.802734375,
"logps/rejected": -201.72024536132812,
"loss": 0.512,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -0.3704865276813507,
"rewards/margins": 0.8075221180915833,
"rewards/rejected": -1.1780085563659668,
"step": 830
},
{
"epoch": 0.23917995444191345,
"grad_norm": 6.875248432159424,
"learning_rate": 1.407751708428246e-05,
"logits/chosen": 1.9156545400619507,
"logits/rejected": 1.8953126668930054,
"logps/chosen": -187.77313232421875,
"logps/rejected": -195.1903076171875,
"loss": 0.535,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.16680260002613068,
"rewards/margins": 0.6953068971633911,
"rewards/rejected": -0.5285042524337769,
"step": 840
},
{
"epoch": 0.24202733485193623,
"grad_norm": 7.518496513366699,
"learning_rate": 1.4077249430523918e-05,
"logits/chosen": 1.585115671157837,
"logits/rejected": 1.5631306171417236,
"logps/chosen": -185.40744018554688,
"logps/rejected": -194.33627319335938,
"loss": 0.5249,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": 0.23953166604042053,
"rewards/margins": 0.7418320178985596,
"rewards/rejected": -0.5023003220558167,
"step": 850
},
{
"epoch": 0.244874715261959,
"grad_norm": 7.626437664031982,
"learning_rate": 1.4076981776765376e-05,
"logits/chosen": 2.024663209915161,
"logits/rejected": 1.9502818584442139,
"logps/chosen": -188.0728302001953,
"logps/rejected": -194.77236938476562,
"loss": 0.4572,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": 0.18363861739635468,
"rewards/margins": 0.8329526782035828,
"rewards/rejected": -0.6493140459060669,
"step": 860
},
{
"epoch": 0.24772209567198178,
"grad_norm": 6.028398036956787,
"learning_rate": 1.4076714123006835e-05,
"logits/chosen": 2.0717482566833496,
"logits/rejected": 2.015554904937744,
"logps/chosen": -190.80706787109375,
"logps/rejected": -197.00929260253906,
"loss": 0.5263,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.23343093693256378,
"rewards/margins": 0.7314363121986389,
"rewards/rejected": -0.9648672938346863,
"step": 870
},
{
"epoch": 0.2505694760820046,
"grad_norm": 11.498568534851074,
"learning_rate": 1.4076446469248293e-05,
"logits/chosen": 1.7024204730987549,
"logits/rejected": 1.6783952713012695,
"logps/chosen": -189.88160705566406,
"logps/rejected": -200.33108520507812,
"loss": 0.5148,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -0.3500627875328064,
"rewards/margins": 0.6954061388969421,
"rewards/rejected": -1.045469045639038,
"step": 880
},
{
"epoch": 0.25341685649202733,
"grad_norm": 8.39510440826416,
"learning_rate": 1.407617881548975e-05,
"logits/chosen": 2.072392225265503,
"logits/rejected": 1.9953988790512085,
"logps/chosen": -184.83578491210938,
"logps/rejected": -192.18081665039062,
"loss": 0.4797,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.27530142664909363,
"rewards/margins": 0.8618084192276001,
"rewards/rejected": -0.5865069627761841,
"step": 890
},
{
"epoch": 0.25626423690205014,
"grad_norm": 7.497321605682373,
"learning_rate": 1.4075911161731207e-05,
"logits/chosen": 2.0645341873168945,
"logits/rejected": 2.0675666332244873,
"logps/chosen": -191.05245971679688,
"logps/rejected": -191.71282958984375,
"loss": 0.5487,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": 0.19750186800956726,
"rewards/margins": 0.5128003358840942,
"rewards/rejected": -0.3152984380722046,
"step": 900
},
{
"epoch": 0.2591116173120729,
"grad_norm": 7.78532600402832,
"learning_rate": 1.4075643507972666e-05,
"logits/chosen": 1.673762321472168,
"logits/rejected": 1.6521574258804321,
"logps/chosen": -186.23394775390625,
"logps/rejected": -194.6929931640625,
"loss": 0.5387,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": 0.042056869715452194,
"rewards/margins": 0.629268229007721,
"rewards/rejected": -0.5872113108634949,
"step": 910
},
{
"epoch": 0.2619589977220957,
"grad_norm": 9.374770164489746,
"learning_rate": 1.4075375854214124e-05,
"logits/chosen": 2.255173444747925,
"logits/rejected": 2.217841625213623,
"logps/chosen": -181.05465698242188,
"logps/rejected": -191.16636657714844,
"loss": 0.4878,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": 0.2503889501094818,
"rewards/margins": 0.7921093702316284,
"rewards/rejected": -0.541720449924469,
"step": 920
},
{
"epoch": 0.26480637813211844,
"grad_norm": 8.376593589782715,
"learning_rate": 1.4075108200455582e-05,
"logits/chosen": 1.7911618947982788,
"logits/rejected": 1.7365095615386963,
"logps/chosen": -197.14320373535156,
"logps/rejected": -204.03855895996094,
"loss": 0.416,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -0.07821293920278549,
"rewards/margins": 1.092116117477417,
"rewards/rejected": -1.1703290939331055,
"step": 930
},
{
"epoch": 0.26765375854214124,
"grad_norm": 12.098174095153809,
"learning_rate": 1.407484054669704e-05,
"logits/chosen": 1.771426796913147,
"logits/rejected": 1.7347352504730225,
"logps/chosen": -190.84811401367188,
"logps/rejected": -202.90049743652344,
"loss": 0.4819,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -0.07078223675489426,
"rewards/margins": 0.9916135668754578,
"rewards/rejected": -1.0623959302902222,
"step": 940
},
{
"epoch": 0.270501138952164,
"grad_norm": 13.370096206665039,
"learning_rate": 1.4074572892938498e-05,
"logits/chosen": 2.4168200492858887,
"logits/rejected": 2.3770642280578613,
"logps/chosen": -197.3751678466797,
"logps/rejected": -205.10140991210938,
"loss": 0.4909,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.508063018321991,
"rewards/margins": 0.8573676943778992,
"rewards/rejected": -1.3654309511184692,
"step": 950
},
{
"epoch": 0.2733485193621868,
"grad_norm": 8.180644989013672,
"learning_rate": 1.4074305239179955e-05,
"logits/chosen": 1.9371917247772217,
"logits/rejected": 1.903172492980957,
"logps/chosen": -203.6707000732422,
"logps/rejected": -210.0899200439453,
"loss": 0.557,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -1.0856568813323975,
"rewards/margins": 0.657774806022644,
"rewards/rejected": -1.7434314489364624,
"step": 960
},
{
"epoch": 0.27619589977220954,
"grad_norm": 7.428178787231445,
"learning_rate": 1.4074037585421413e-05,
"logits/chosen": 1.4970729351043701,
"logits/rejected": 1.4920985698699951,
"logps/chosen": -197.6659698486328,
"logps/rejected": -204.35134887695312,
"loss": 0.4897,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.8745123147964478,
"rewards/margins": 0.7961224913597107,
"rewards/rejected": -1.6706346273422241,
"step": 970
},
{
"epoch": 0.27904328018223234,
"grad_norm": 9.783337593078613,
"learning_rate": 1.4073769931662871e-05,
"logits/chosen": 1.8168764114379883,
"logits/rejected": 1.7757813930511475,
"logps/chosen": -190.90243530273438,
"logps/rejected": -200.79592895507812,
"loss": 0.4804,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.17690226435661316,
"rewards/margins": 0.8342889547348022,
"rewards/rejected": -1.0111911296844482,
"step": 980
},
{
"epoch": 0.28189066059225515,
"grad_norm": 10.380950927734375,
"learning_rate": 1.407350227790433e-05,
"logits/chosen": 1.904358148574829,
"logits/rejected": 1.8814504146575928,
"logps/chosen": -192.22409057617188,
"logps/rejected": -194.5697021484375,
"loss": 0.5484,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.04954533651471138,
"rewards/margins": 0.7774869799613953,
"rewards/rejected": -0.727941632270813,
"step": 990
},
{
"epoch": 0.2847380410022779,
"grad_norm": 12.717456817626953,
"learning_rate": 1.4073234624145788e-05,
"logits/chosen": 1.9216387271881104,
"logits/rejected": 1.909235954284668,
"logps/chosen": -189.25656127929688,
"logps/rejected": -189.6717987060547,
"loss": 0.6481,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": 0.24547600746154785,
"rewards/margins": 0.46644410490989685,
"rewards/rejected": -0.22096815705299377,
"step": 1000
},
{
"epoch": 0.2875854214123007,
"grad_norm": 8.20633316040039,
"learning_rate": 1.4072966970387244e-05,
"logits/chosen": 2.119447708129883,
"logits/rejected": 2.0835766792297363,
"logps/chosen": -185.17373657226562,
"logps/rejected": -197.90789794921875,
"loss": 0.4552,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": 0.463967889547348,
"rewards/margins": 0.9948002099990845,
"rewards/rejected": -0.5308324098587036,
"step": 1010
},
{
"epoch": 0.29043280182232345,
"grad_norm": 7.67072868347168,
"learning_rate": 1.4072699316628702e-05,
"logits/chosen": 2.101510763168335,
"logits/rejected": 2.0747411251068115,
"logps/chosen": -189.11227416992188,
"logps/rejected": -199.26864624023438,
"loss": 0.4964,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.03253510594367981,
"rewards/margins": 0.7407889366149902,
"rewards/rejected": -0.7733240723609924,
"step": 1020
},
{
"epoch": 0.29328018223234625,
"grad_norm": 2.5675323009490967,
"learning_rate": 1.4072431662870159e-05,
"logits/chosen": 1.761235237121582,
"logits/rejected": 1.69741952419281,
"logps/chosen": -192.15017700195312,
"logps/rejected": -196.58164978027344,
"loss": 0.3807,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": 0.30114102363586426,
"rewards/margins": 1.2773263454437256,
"rewards/rejected": -0.9761852025985718,
"step": 1030
},
{
"epoch": 0.296127562642369,
"grad_norm": 6.229589462280273,
"learning_rate": 1.4072164009111617e-05,
"logits/chosen": 1.8937768936157227,
"logits/rejected": 1.8909927606582642,
"logps/chosen": -192.98153686523438,
"logps/rejected": -198.7643585205078,
"loss": 0.4607,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.3074316382408142,
"rewards/margins": 0.925214946269989,
"rewards/rejected": -1.2326464653015137,
"step": 1040
},
{
"epoch": 0.2989749430523918,
"grad_norm": 21.918458938598633,
"learning_rate": 1.4071896355353075e-05,
"logits/chosen": 1.5101830959320068,
"logits/rejected": 1.4930336475372314,
"logps/chosen": -187.5404510498047,
"logps/rejected": -200.8413543701172,
"loss": 0.5419,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": 0.17931926250457764,
"rewards/margins": 0.7964944839477539,
"rewards/rejected": -0.6171752214431763,
"step": 1050
},
{
"epoch": 0.30182232346241455,
"grad_norm": 8.404467582702637,
"learning_rate": 1.4071628701594533e-05,
"logits/chosen": 1.6592843532562256,
"logits/rejected": 1.675798773765564,
"logps/chosen": -191.4521026611328,
"logps/rejected": -199.44200134277344,
"loss": 0.4875,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.2935402989387512,
"rewards/margins": 1.0806810855865479,
"rewards/rejected": -1.3742212057113647,
"step": 1060
},
{
"epoch": 0.30466970387243736,
"grad_norm": 8.605794906616211,
"learning_rate": 1.4071361047835991e-05,
"logits/chosen": 2.1089935302734375,
"logits/rejected": 2.0790882110595703,
"logps/chosen": -203.49526977539062,
"logps/rejected": -208.977294921875,
"loss": 0.4672,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7177016139030457,
"rewards/margins": 0.9588042497634888,
"rewards/rejected": -1.6765056848526,
"step": 1070
},
{
"epoch": 0.30751708428246016,
"grad_norm": 10.008315086364746,
"learning_rate": 1.407109339407745e-05,
"logits/chosen": 1.850354790687561,
"logits/rejected": 1.805641770362854,
"logps/chosen": -195.0923309326172,
"logps/rejected": -202.8920440673828,
"loss": 0.5457,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -0.3235352039337158,
"rewards/margins": 0.6880014538764954,
"rewards/rejected": -1.011536717414856,
"step": 1080
},
{
"epoch": 0.3103644646924829,
"grad_norm": 13.152917861938477,
"learning_rate": 1.4070825740318908e-05,
"logits/chosen": 1.8469655513763428,
"logits/rejected": 1.8281362056732178,
"logps/chosen": -199.08865356445312,
"logps/rejected": -203.9534912109375,
"loss": 0.5631,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -0.3010219931602478,
"rewards/margins": 0.7448440790176392,
"rewards/rejected": -1.0458661317825317,
"step": 1090
},
{
"epoch": 0.3132118451025057,
"grad_norm": 13.376197814941406,
"learning_rate": 1.4070558086560364e-05,
"logits/chosen": 2.043653964996338,
"logits/rejected": 1.949476957321167,
"logps/chosen": -199.87838745117188,
"logps/rejected": -205.461181640625,
"loss": 0.4732,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.33904480934143066,
"rewards/margins": 1.0151432752609253,
"rewards/rejected": -1.354188084602356,
"step": 1100
},
{
"epoch": 0.31605922551252846,
"grad_norm": 11.804425239562988,
"learning_rate": 1.4070290432801822e-05,
"logits/chosen": 1.7380597591400146,
"logits/rejected": 1.7194397449493408,
"logps/chosen": -196.87367248535156,
"logps/rejected": -205.9291229248047,
"loss": 0.4909,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6721280813217163,
"rewards/margins": 0.8940645456314087,
"rewards/rejected": -1.566192626953125,
"step": 1110
},
{
"epoch": 0.31890660592255127,
"grad_norm": 11.524681091308594,
"learning_rate": 1.407002277904328e-05,
"logits/chosen": 1.8095728158950806,
"logits/rejected": 1.7724698781967163,
"logps/chosen": -194.70455932617188,
"logps/rejected": -205.51736450195312,
"loss": 0.3973,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.4333480894565582,
"rewards/margins": 1.3439867496490479,
"rewards/rejected": -1.7773349285125732,
"step": 1120
},
{
"epoch": 0.321753986332574,
"grad_norm": 10.198122024536133,
"learning_rate": 1.4069755125284739e-05,
"logits/chosen": 1.5907586812973022,
"logits/rejected": 1.554947853088379,
"logps/chosen": -198.55319213867188,
"logps/rejected": -208.1469268798828,
"loss": 0.3429,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -0.379428893327713,
"rewards/margins": 1.4134459495544434,
"rewards/rejected": -1.7928749322891235,
"step": 1130
},
{
"epoch": 0.3246013667425968,
"grad_norm": 11.933664321899414,
"learning_rate": 1.4069487471526197e-05,
"logits/chosen": 2.073201894760132,
"logits/rejected": 2.0558342933654785,
"logps/chosen": -196.92529296875,
"logps/rejected": -207.8292694091797,
"loss": 0.492,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.7831190824508667,
"rewards/margins": 0.9773709177970886,
"rewards/rejected": -1.7604900598526,
"step": 1140
},
{
"epoch": 0.32744874715261957,
"grad_norm": 10.23611831665039,
"learning_rate": 1.4069219817767655e-05,
"logits/chosen": 1.8735500574111938,
"logits/rejected": 1.8228156566619873,
"logps/chosen": -198.6478271484375,
"logps/rejected": -211.98245239257812,
"loss": 0.533,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -1.0398519039154053,
"rewards/margins": 0.9020865559577942,
"rewards/rejected": -1.9419386386871338,
"step": 1150
},
{
"epoch": 0.33029612756264237,
"grad_norm": 8.893653869628906,
"learning_rate": 1.4068952164009113e-05,
"logits/chosen": 1.9107131958007812,
"logits/rejected": 1.891579031944275,
"logps/chosen": -197.96861267089844,
"logps/rejected": -205.613037109375,
"loss": 0.5612,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.6610264182090759,
"rewards/margins": 0.8070106506347656,
"rewards/rejected": -1.4680370092391968,
"step": 1160
},
{
"epoch": 0.3331435079726651,
"grad_norm": 6.878332614898682,
"learning_rate": 1.406868451025057e-05,
"logits/chosen": 1.7560676336288452,
"logits/rejected": 1.7299124002456665,
"logps/chosen": -192.83486938476562,
"logps/rejected": -201.4220733642578,
"loss": 0.4402,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.16509667038917542,
"rewards/margins": 1.2705130577087402,
"rewards/rejected": -1.4356096982955933,
"step": 1170
},
{
"epoch": 0.3359908883826879,
"grad_norm": 13.861001014709473,
"learning_rate": 1.4068416856492028e-05,
"logits/chosen": 2.2744829654693604,
"logits/rejected": 2.2301721572875977,
"logps/chosen": -191.3073272705078,
"logps/rejected": -203.01943969726562,
"loss": 0.4838,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.39433032274246216,
"rewards/margins": 1.16512930393219,
"rewards/rejected": -1.5594595670700073,
"step": 1180
},
{
"epoch": 0.33883826879271073,
"grad_norm": 8.740705490112305,
"learning_rate": 1.4068149202733486e-05,
"logits/chosen": 2.088308095932007,
"logits/rejected": 2.0627994537353516,
"logps/chosen": -197.44985961914062,
"logps/rejected": -206.495361328125,
"loss": 0.5276,
"rewards/accuracies": 0.6833332777023315,
"rewards/chosen": -0.9009159207344055,
"rewards/margins": 0.9384552836418152,
"rewards/rejected": -1.8393710851669312,
"step": 1190
},
{
"epoch": 0.3416856492027335,
"grad_norm": 12.754415512084961,
"learning_rate": 1.4067881548974944e-05,
"logits/chosen": 1.8001388311386108,
"logits/rejected": 1.7345482110977173,
"logps/chosen": -204.52352905273438,
"logps/rejected": -215.9031524658203,
"loss": 0.4862,
"rewards/accuracies": 0.7500001192092896,
"rewards/chosen": -1.0903289318084717,
"rewards/margins": 1.0529909133911133,
"rewards/rejected": -2.143319606781006,
"step": 1200
},
{
"epoch": 0.3445330296127563,
"grad_norm": 12.682507514953613,
"learning_rate": 1.4067613895216402e-05,
"logits/chosen": 1.8131275177001953,
"logits/rejected": 1.7931241989135742,
"logps/chosen": -192.96182250976562,
"logps/rejected": -201.81529235839844,
"loss": 0.5983,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.5940048098564148,
"rewards/margins": 0.8659802675247192,
"rewards/rejected": -1.4599850177764893,
"step": 1210
},
{
"epoch": 0.34738041002277903,
"grad_norm": 5.179973602294922,
"learning_rate": 1.406734624145786e-05,
"logits/chosen": 1.7856050729751587,
"logits/rejected": 1.7136350870132446,
"logps/chosen": -190.21469116210938,
"logps/rejected": -199.30459594726562,
"loss": 0.4641,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.08599545061588287,
"rewards/margins": 1.0723307132720947,
"rewards/rejected": -0.9863353967666626,
"step": 1220
},
{
"epoch": 0.35022779043280183,
"grad_norm": 9.299216270446777,
"learning_rate": 1.4067078587699317e-05,
"logits/chosen": 2.1109414100646973,
"logits/rejected": 2.084815263748169,
"logps/chosen": -198.41253662109375,
"logps/rejected": -201.63990783691406,
"loss": 0.5133,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.5270928740501404,
"rewards/margins": 0.92596435546875,
"rewards/rejected": -1.453057050704956,
"step": 1230
},
{
"epoch": 0.3530751708428246,
"grad_norm": 4.547011852264404,
"learning_rate": 1.4066810933940774e-05,
"logits/chosen": 1.6101499795913696,
"logits/rejected": 1.5630186796188354,
"logps/chosen": -199.369384765625,
"logps/rejected": -206.1526336669922,
"loss": 0.5698,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": -0.8662067651748657,
"rewards/margins": 0.6912031769752502,
"rewards/rejected": -1.5574098825454712,
"step": 1240
},
{
"epoch": 0.3559225512528474,
"grad_norm": 8.161728858947754,
"learning_rate": 1.4066543280182232e-05,
"logits/chosen": 1.626274824142456,
"logits/rejected": 1.608764410018921,
"logps/chosen": -199.33700561523438,
"logps/rejected": -207.776123046875,
"loss": 0.5068,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -1.0500733852386475,
"rewards/margins": 0.8237847089767456,
"rewards/rejected": -1.873858094215393,
"step": 1250
},
{
"epoch": 0.35876993166287013,
"grad_norm": 9.337955474853516,
"learning_rate": 1.406627562642369e-05,
"logits/chosen": 1.614162802696228,
"logits/rejected": 1.5890023708343506,
"logps/chosen": -191.52984619140625,
"logps/rejected": -204.58090209960938,
"loss": 0.4568,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.9015741348266602,
"rewards/margins": 1.1320171356201172,
"rewards/rejected": -2.0335912704467773,
"step": 1260
},
{
"epoch": 0.36161731207289294,
"grad_norm": 12.726293563842773,
"learning_rate": 1.4066007972665148e-05,
"logits/chosen": 1.9878727197647095,
"logits/rejected": 1.9336633682250977,
"logps/chosen": -196.55551147460938,
"logps/rejected": -206.54592895507812,
"loss": 0.5638,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9223095178604126,
"rewards/margins": 0.6745713949203491,
"rewards/rejected": -1.5968811511993408,
"step": 1270
},
{
"epoch": 0.36446469248291574,
"grad_norm": 5.186792373657227,
"learning_rate": 1.4065740318906606e-05,
"logits/chosen": 2.260460376739502,
"logits/rejected": 2.2322933673858643,
"logps/chosen": -191.57492065429688,
"logps/rejected": -204.7289581298828,
"loss": 0.4348,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.18739666044712067,
"rewards/margins": 1.1786158084869385,
"rewards/rejected": -1.366012454032898,
"step": 1280
},
{
"epoch": 0.3673120728929385,
"grad_norm": 24.165205001831055,
"learning_rate": 1.4065472665148064e-05,
"logits/chosen": 1.9065678119659424,
"logits/rejected": 1.8787645101547241,
"logps/chosen": -190.4572296142578,
"logps/rejected": -203.6009521484375,
"loss": 0.4677,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2891443371772766,
"rewards/margins": 1.2570773363113403,
"rewards/rejected": -0.967933177947998,
"step": 1290
},
{
"epoch": 0.3701594533029613,
"grad_norm": 10.15389633178711,
"learning_rate": 1.4065205011389523e-05,
"logits/chosen": 2.21083664894104,
"logits/rejected": 2.1624722480773926,
"logps/chosen": -191.64566040039062,
"logps/rejected": -203.72608947753906,
"loss": 0.5796,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.5001910924911499,
"rewards/margins": 0.7388169765472412,
"rewards/rejected": -1.2390079498291016,
"step": 1300
},
{
"epoch": 0.37300683371298404,
"grad_norm": 8.427837371826172,
"learning_rate": 1.4064937357630979e-05,
"logits/chosen": 1.5343374013900757,
"logits/rejected": 1.5085475444793701,
"logps/chosen": -198.10842895507812,
"logps/rejected": -214.72476196289062,
"loss": 0.4727,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.6099187135696411,
"rewards/margins": 1.084291696548462,
"rewards/rejected": -1.694210410118103,
"step": 1310
},
{
"epoch": 0.37585421412300685,
"grad_norm": 9.226961135864258,
"learning_rate": 1.4064669703872437e-05,
"logits/chosen": 2.1237213611602783,
"logits/rejected": 2.0830631256103516,
"logps/chosen": -199.69631958007812,
"logps/rejected": -211.4775390625,
"loss": 0.4277,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -0.9815210103988647,
"rewards/margins": 1.2467997074127197,
"rewards/rejected": -2.228320360183716,
"step": 1320
},
{
"epoch": 0.3787015945330296,
"grad_norm": 12.941588401794434,
"learning_rate": 1.4064402050113895e-05,
"logits/chosen": 2.2387449741363525,
"logits/rejected": 2.177280902862549,
"logps/chosen": -207.823974609375,
"logps/rejected": -223.4504852294922,
"loss": 0.3913,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -1.2012609243392944,
"rewards/margins": 1.5182263851165771,
"rewards/rejected": -2.719486951828003,
"step": 1330
},
{
"epoch": 0.3815489749430524,
"grad_norm": 14.2744722366333,
"learning_rate": 1.4064134396355354e-05,
"logits/chosen": 2.0903189182281494,
"logits/rejected": 2.0274603366851807,
"logps/chosen": -204.43905639648438,
"logps/rejected": -217.285888671875,
"loss": 0.4283,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1711370944976807,
"rewards/margins": 1.2676665782928467,
"rewards/rejected": -2.4388039112091064,
"step": 1340
},
{
"epoch": 0.38439635535307515,
"grad_norm": 12.081289291381836,
"learning_rate": 1.4063866742596812e-05,
"logits/chosen": 2.0742850303649902,
"logits/rejected": 2.009190797805786,
"logps/chosen": -202.11859130859375,
"logps/rejected": -212.6031494140625,
"loss": 0.5151,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2778351306915283,
"rewards/margins": 1.1755545139312744,
"rewards/rejected": -2.4533896446228027,
"step": 1350
},
{
"epoch": 0.38724373576309795,
"grad_norm": 11.74995231628418,
"learning_rate": 1.406359908883827e-05,
"logits/chosen": 2.0087525844573975,
"logits/rejected": 1.9561760425567627,
"logps/chosen": -203.6864013671875,
"logps/rejected": -213.94332885742188,
"loss": 0.5604,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": -1.6550267934799194,
"rewards/margins": 0.7960943579673767,
"rewards/rejected": -2.4511213302612305,
"step": 1360
},
{
"epoch": 0.39009111617312076,
"grad_norm": 9.296886444091797,
"learning_rate": 1.4063331435079728e-05,
"logits/chosen": 1.7611463069915771,
"logits/rejected": 1.6809743642807007,
"logps/chosen": -209.0948028564453,
"logps/rejected": -217.0623779296875,
"loss": 0.5996,
"rewards/accuracies": 0.7166667580604553,
"rewards/chosen": -1.9659429788589478,
"rewards/margins": 0.6299344897270203,
"rewards/rejected": -2.5958774089813232,
"step": 1370
},
{
"epoch": 0.3929384965831435,
"grad_norm": 4.803467273712158,
"learning_rate": 1.4063063781321185e-05,
"logits/chosen": 1.5451164245605469,
"logits/rejected": 1.5325957536697388,
"logps/chosen": -208.9802703857422,
"logps/rejected": -213.04342651367188,
"loss": 0.5885,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -1.538881540298462,
"rewards/margins": 0.8458682298660278,
"rewards/rejected": -2.3847498893737793,
"step": 1380
},
{
"epoch": 0.3957858769931663,
"grad_norm": 10.66770076751709,
"learning_rate": 1.4062796127562643e-05,
"logits/chosen": 1.4009406566619873,
"logits/rejected": 1.37747323513031,
"logps/chosen": -204.7423095703125,
"logps/rejected": -205.3677520751953,
"loss": 0.4312,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.7842095494270325,
"rewards/margins": 1.1969635486602783,
"rewards/rejected": -1.9811729192733765,
"step": 1390
},
{
"epoch": 0.39863325740318906,
"grad_norm": 7.478954315185547,
"learning_rate": 1.4062528473804101e-05,
"logits/chosen": 1.744096040725708,
"logits/rejected": 1.6914621591567993,
"logps/chosen": -203.30039978027344,
"logps/rejected": -209.58816528320312,
"loss": 0.4153,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.6862292885780334,
"rewards/margins": 1.180418848991394,
"rewards/rejected": -1.8666483163833618,
"step": 1400
},
{
"epoch": 0.40148063781321186,
"grad_norm": 7.500680923461914,
"learning_rate": 1.4062260820045559e-05,
"logits/chosen": 1.2981688976287842,
"logits/rejected": 1.2812628746032715,
"logps/chosen": -198.92678833007812,
"logps/rejected": -209.11264038085938,
"loss": 0.5276,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.5509114265441895,
"rewards/margins": 0.9838689565658569,
"rewards/rejected": -1.5347803831100464,
"step": 1410
},
{
"epoch": 0.4043280182232346,
"grad_norm": 7.44333553314209,
"learning_rate": 1.4061993166287017e-05,
"logits/chosen": 1.2393553256988525,
"logits/rejected": 1.1910442113876343,
"logps/chosen": -199.60992431640625,
"logps/rejected": -212.3382110595703,
"loss": 0.4608,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7129297852516174,
"rewards/margins": 1.1817357540130615,
"rewards/rejected": -1.8946659564971924,
"step": 1420
},
{
"epoch": 0.4071753986332574,
"grad_norm": 9.176630020141602,
"learning_rate": 1.4061725512528475e-05,
"logits/chosen": 1.7447378635406494,
"logits/rejected": 1.6703119277954102,
"logps/chosen": -198.02667236328125,
"logps/rejected": -206.87356567382812,
"loss": 0.5164,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -0.953807532787323,
"rewards/margins": 0.9193550944328308,
"rewards/rejected": -1.873162865638733,
"step": 1430
},
{
"epoch": 0.41002277904328016,
"grad_norm": 6.571470737457275,
"learning_rate": 1.4061457858769934e-05,
"logits/chosen": 1.5202054977416992,
"logits/rejected": 1.4729554653167725,
"logps/chosen": -204.49246215820312,
"logps/rejected": -215.2471160888672,
"loss": 0.4928,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -1.5182138681411743,
"rewards/margins": 1.0777556896209717,
"rewards/rejected": -2.5959696769714355,
"step": 1440
},
{
"epoch": 0.41287015945330297,
"grad_norm": 11.84085750579834,
"learning_rate": 1.406119020501139e-05,
"logits/chosen": 1.6191060543060303,
"logits/rejected": 1.5997555255889893,
"logps/chosen": -212.749755859375,
"logps/rejected": -218.8915252685547,
"loss": 0.4843,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -1.8460090160369873,
"rewards/margins": 1.0541627407073975,
"rewards/rejected": -2.9001717567443848,
"step": 1450
},
{
"epoch": 0.4157175398633257,
"grad_norm": 5.9276838302612305,
"learning_rate": 1.4060922551252847e-05,
"logits/chosen": 1.4602617025375366,
"logits/rejected": 1.4181015491485596,
"logps/chosen": -205.3323211669922,
"logps/rejected": -209.7090606689453,
"loss": 0.4555,
"rewards/accuracies": 0.76666659116745,
"rewards/chosen": -1.1707617044448853,
"rewards/margins": 1.0197278261184692,
"rewards/rejected": -2.1904895305633545,
"step": 1460
},
{
"epoch": 0.4185649202733485,
"grad_norm": 11.324074745178223,
"learning_rate": 1.4060654897494305e-05,
"logits/chosen": 1.845425009727478,
"logits/rejected": 1.790997862815857,
"logps/chosen": -205.37661743164062,
"logps/rejected": -211.1995086669922,
"loss": 0.5938,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6029345989227295,
"rewards/margins": 0.7839881777763367,
"rewards/rejected": -2.38692307472229,
"step": 1470
},
{
"epoch": 0.4214123006833713,
"grad_norm": 9.8126220703125,
"learning_rate": 1.4060387243735763e-05,
"logits/chosen": 1.5168917179107666,
"logits/rejected": 1.4749294519424438,
"logps/chosen": -200.8393096923828,
"logps/rejected": -207.88693237304688,
"loss": 0.4886,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.8076359033584595,
"rewards/margins": 1.1594288349151611,
"rewards/rejected": -1.967064619064331,
"step": 1480
},
{
"epoch": 0.42425968109339407,
"grad_norm": 13.349224090576172,
"learning_rate": 1.4060119589977221e-05,
"logits/chosen": 1.6546955108642578,
"logits/rejected": 1.641208291053772,
"logps/chosen": -198.56167602539062,
"logps/rejected": -204.73019409179688,
"loss": 0.5426,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.6402263045310974,
"rewards/margins": 0.8992630243301392,
"rewards/rejected": -1.5394892692565918,
"step": 1490
},
{
"epoch": 0.4271070615034169,
"grad_norm": 16.132837295532227,
"learning_rate": 1.405985193621868e-05,
"logits/chosen": 1.2271068096160889,
"logits/rejected": 1.2327044010162354,
"logps/chosen": -201.4064178466797,
"logps/rejected": -205.9292449951172,
"loss": 0.4608,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -1.0444819927215576,
"rewards/margins": 1.0280535221099854,
"rewards/rejected": -2.072535514831543,
"step": 1500
},
{
"epoch": 0.4299544419134396,
"grad_norm": 13.677680969238281,
"learning_rate": 1.4059584282460137e-05,
"logits/chosen": 1.3775099515914917,
"logits/rejected": 1.3772714138031006,
"logps/chosen": -201.0505828857422,
"logps/rejected": -210.51806640625,
"loss": 0.4665,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -1.2761850357055664,
"rewards/margins": 1.0018669366836548,
"rewards/rejected": -2.2780518531799316,
"step": 1510
},
{
"epoch": 0.4328018223234624,
"grad_norm": 13.681148529052734,
"learning_rate": 1.4059316628701594e-05,
"logits/chosen": 1.206789255142212,
"logits/rejected": 1.1529737710952759,
"logps/chosen": -199.44151306152344,
"logps/rejected": -211.2483367919922,
"loss": 0.5924,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2694971561431885,
"rewards/margins": 0.7812899351119995,
"rewards/rejected": -2.0507869720458984,
"step": 1520
},
{
"epoch": 0.4356492027334852,
"grad_norm": 6.829047203063965,
"learning_rate": 1.4059048974943052e-05,
"logits/chosen": 1.3871477842330933,
"logits/rejected": 1.3141772747039795,
"logps/chosen": -201.22048950195312,
"logps/rejected": -206.67276000976562,
"loss": 0.5169,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -1.4746379852294922,
"rewards/margins": 1.0388069152832031,
"rewards/rejected": -2.5134449005126953,
"step": 1530
},
{
"epoch": 0.438496583143508,
"grad_norm": 11.46579360961914,
"learning_rate": 1.405878132118451e-05,
"logits/chosen": 1.273644208908081,
"logits/rejected": 1.2350003719329834,
"logps/chosen": -211.4403839111328,
"logps/rejected": -220.148193359375,
"loss": 0.598,
"rewards/accuracies": 0.6999999284744263,
"rewards/chosen": -1.6227350234985352,
"rewards/margins": 0.9658929705619812,
"rewards/rejected": -2.588628053665161,
"step": 1540
},
{
"epoch": 0.4413439635535307,
"grad_norm": 14.203997611999512,
"learning_rate": 1.4058513667425969e-05,
"logits/chosen": 1.1952084302902222,
"logits/rejected": 1.1850013732910156,
"logps/chosen": -203.4510498046875,
"logps/rejected": -210.08004760742188,
"loss": 0.5225,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -1.5274189710617065,
"rewards/margins": 0.8954464197158813,
"rewards/rejected": -2.422865390777588,
"step": 1550
},
{
"epoch": 0.44419134396355353,
"grad_norm": 7.090624809265137,
"learning_rate": 1.4058246013667427e-05,
"logits/chosen": 1.4373114109039307,
"logits/rejected": 1.3903038501739502,
"logps/chosen": -201.7006378173828,
"logps/rejected": -211.03781127929688,
"loss": 0.5677,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -0.874021053314209,
"rewards/margins": 0.9151598811149597,
"rewards/rejected": -1.7891807556152344,
"step": 1560
},
{
"epoch": 0.44703872437357633,
"grad_norm": 10.441471099853516,
"learning_rate": 1.4057978359908885e-05,
"logits/chosen": 1.3745132684707642,
"logits/rejected": 1.2919235229492188,
"logps/chosen": -198.87559509277344,
"logps/rejected": -217.3492431640625,
"loss": 0.5406,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -1.0884069204330444,
"rewards/margins": 1.036794900894165,
"rewards/rejected": -2.125201940536499,
"step": 1570
},
{
"epoch": 0.4498861047835991,
"grad_norm": 5.564964294433594,
"learning_rate": 1.4057710706150343e-05,
"logits/chosen": 1.195963740348816,
"logits/rejected": 1.1703064441680908,
"logps/chosen": -205.21533203125,
"logps/rejected": -211.69650268554688,
"loss": 0.5185,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -1.2169151306152344,
"rewards/margins": 0.9917265176773071,
"rewards/rejected": -2.208641529083252,
"step": 1580
},
{
"epoch": 0.4527334851936219,
"grad_norm": 5.4993696212768555,
"learning_rate": 1.40574430523918e-05,
"logits/chosen": 0.8286596536636353,
"logits/rejected": 0.8261906504631042,
"logps/chosen": -198.970947265625,
"logps/rejected": -216.66171264648438,
"loss": 0.3587,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.8958838582038879,
"rewards/margins": 1.5535662174224854,
"rewards/rejected": -2.4494500160217285,
"step": 1590
},
{
"epoch": 0.45558086560364464,
"grad_norm": 7.70150089263916,
"learning_rate": 1.4057175398633258e-05,
"logits/chosen": 1.3162205219268799,
"logits/rejected": 1.2919889688491821,
"logps/chosen": -206.515625,
"logps/rejected": -222.2469482421875,
"loss": 0.4,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -1.3980159759521484,
"rewards/margins": 1.3234660625457764,
"rewards/rejected": -2.7214818000793457,
"step": 1600
},
{
"epoch": 0.45842824601366744,
"grad_norm": 11.316675186157227,
"learning_rate": 1.4056907744874716e-05,
"logits/chosen": 1.2483917474746704,
"logits/rejected": 1.2210924625396729,
"logps/chosen": -206.95620727539062,
"logps/rejected": -220.11636352539062,
"loss": 0.4602,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7000877857208252,
"rewards/margins": 1.2964524030685425,
"rewards/rejected": -2.9965403079986572,
"step": 1610
},
{
"epoch": 0.4612756264236902,
"grad_norm": 14.703081130981445,
"learning_rate": 1.4056640091116174e-05,
"logits/chosen": 1.3368771076202393,
"logits/rejected": 1.2918922901153564,
"logps/chosen": -210.63418579101562,
"logps/rejected": -220.60440063476562,
"loss": 0.4989,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -1.670444130897522,
"rewards/margins": 1.2454063892364502,
"rewards/rejected": -2.9158504009246826,
"step": 1620
},
{
"epoch": 0.464123006833713,
"grad_norm": 8.3635892868042,
"learning_rate": 1.4056372437357632e-05,
"logits/chosen": 1.290725588798523,
"logits/rejected": 1.2314881086349487,
"logps/chosen": -208.1670684814453,
"logps/rejected": -216.7353057861328,
"loss": 0.3892,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -1.7449800968170166,
"rewards/margins": 1.2695982456207275,
"rewards/rejected": -3.014578342437744,
"step": 1630
},
{
"epoch": 0.46697038724373574,
"grad_norm": 20.11248779296875,
"learning_rate": 1.405610478359909e-05,
"logits/chosen": 1.5877583026885986,
"logits/rejected": 1.5239537954330444,
"logps/chosen": -208.3041534423828,
"logps/rejected": -223.48318481445312,
"loss": 0.369,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -1.7909959554672241,
"rewards/margins": 1.4336395263671875,
"rewards/rejected": -3.224635362625122,
"step": 1640
},
{
"epoch": 0.46981776765375854,
"grad_norm": 13.5833101272583,
"learning_rate": 1.4055837129840549e-05,
"logits/chosen": 1.738555669784546,
"logits/rejected": 1.6566972732543945,
"logps/chosen": -209.2194061279297,
"logps/rejected": -219.2682647705078,
"loss": 0.4167,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -1.7458181381225586,
"rewards/margins": 1.2833728790283203,
"rewards/rejected": -3.029191255569458,
"step": 1650
},
{
"epoch": 0.47266514806378135,
"grad_norm": 22.821298599243164,
"learning_rate": 1.4055569476082005e-05,
"logits/chosen": 1.3880656957626343,
"logits/rejected": 1.3096697330474854,
"logps/chosen": -203.7255859375,
"logps/rejected": -220.28024291992188,
"loss": 0.4339,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0704998970031738,
"rewards/margins": 1.3066881895065308,
"rewards/rejected": -2.377188205718994,
"step": 1660
},
{
"epoch": 0.4755125284738041,
"grad_norm": 11.496463775634766,
"learning_rate": 1.4055301822323463e-05,
"logits/chosen": 1.432905912399292,
"logits/rejected": 1.3992332220077515,
"logps/chosen": -208.29953002929688,
"logps/rejected": -224.51248168945312,
"loss": 0.4553,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -1.9868751764297485,
"rewards/margins": 1.2016279697418213,
"rewards/rejected": -3.1885030269622803,
"step": 1670
},
{
"epoch": 0.4783599088838269,
"grad_norm": 8.086127281188965,
"learning_rate": 1.405503416856492e-05,
"logits/chosen": 1.6646270751953125,
"logits/rejected": 1.6253557205200195,
"logps/chosen": -216.95669555664062,
"logps/rejected": -232.74746704101562,
"loss": 0.4144,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -2.4913864135742188,
"rewards/margins": 1.5564768314361572,
"rewards/rejected": -4.047863483428955,
"step": 1680
},
{
"epoch": 0.48120728929384965,
"grad_norm": 13.472739219665527,
"learning_rate": 1.4054766514806378e-05,
"logits/chosen": 2.18731427192688,
"logits/rejected": 2.1197452545166016,
"logps/chosen": -221.1719970703125,
"logps/rejected": -232.5544891357422,
"loss": 0.4315,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -2.685852289199829,
"rewards/margins": 1.4282853603363037,
"rewards/rejected": -4.114137172698975,
"step": 1690
},
{
"epoch": 0.48405466970387245,
"grad_norm": 12.491531372070312,
"learning_rate": 1.4054498861047836e-05,
"logits/chosen": 1.754373550415039,
"logits/rejected": 1.7022701501846313,
"logps/chosen": -210.3401641845703,
"logps/rejected": -228.8831024169922,
"loss": 0.3737,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -2.0991439819335938,
"rewards/margins": 1.6580852270126343,
"rewards/rejected": -3.7572293281555176,
"step": 1700
},
{
"epoch": 0.4869020501138952,
"grad_norm": 18.713422775268555,
"learning_rate": 1.4054231207289294e-05,
"logits/chosen": 1.9377641677856445,
"logits/rejected": 1.906818151473999,
"logps/chosen": -217.85159301757812,
"logps/rejected": -230.37515258789062,
"loss": 0.3994,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -2.5920052528381348,
"rewards/margins": 1.5738455057144165,
"rewards/rejected": -4.1658501625061035,
"step": 1710
},
{
"epoch": 0.489749430523918,
"grad_norm": 21.47015380859375,
"learning_rate": 1.4053963553530752e-05,
"logits/chosen": 1.4932907819747925,
"logits/rejected": 1.4301973581314087,
"logps/chosen": -216.68905639648438,
"logps/rejected": -233.25424194335938,
"loss": 0.4647,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -2.9133524894714355,
"rewards/margins": 1.4387586116790771,
"rewards/rejected": -4.352110862731934,
"step": 1720
},
{
"epoch": 0.49259681093394075,
"grad_norm": 14.534845352172852,
"learning_rate": 1.4053695899772209e-05,
"logits/chosen": 1.4016607999801636,
"logits/rejected": 1.3695752620697021,
"logps/chosen": -219.4730682373047,
"logps/rejected": -229.04714965820312,
"loss": 0.6529,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": -3.115053176879883,
"rewards/margins": 1.099458932876587,
"rewards/rejected": -4.214511871337891,
"step": 1730
},
{
"epoch": 0.49544419134396356,
"grad_norm": 21.40360450744629,
"learning_rate": 1.4053428246013667e-05,
"logits/chosen": 1.6236956119537354,
"logits/rejected": 1.583548903465271,
"logps/chosen": -214.4650421142578,
"logps/rejected": -230.59060668945312,
"loss": 0.6043,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -2.8126559257507324,
"rewards/margins": 1.1184018850326538,
"rewards/rejected": -3.9310576915740967,
"step": 1740
},
{
"epoch": 0.4982915717539863,
"grad_norm": 19.227012634277344,
"learning_rate": 1.4053160592255125e-05,
"logits/chosen": 1.6187642812728882,
"logits/rejected": 1.6092262268066406,
"logps/chosen": -218.21994018554688,
"logps/rejected": -234.0594024658203,
"loss": 0.3583,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -2.342740535736084,
"rewards/margins": 1.5823628902435303,
"rewards/rejected": -3.925102949142456,
"step": 1750
},
{
"epoch": 0.5011389521640092,
"grad_norm": 20.58732795715332,
"learning_rate": 1.4052892938496583e-05,
"logits/chosen": 2.2254767417907715,
"logits/rejected": 2.212104320526123,
"logps/chosen": -214.5471954345703,
"logps/rejected": -223.7714080810547,
"loss": 0.5137,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -2.4208734035491943,
"rewards/margins": 1.094592809677124,
"rewards/rejected": -3.5154662132263184,
"step": 1760
},
{
"epoch": 0.5039863325740319,
"grad_norm": 12.883881568908691,
"learning_rate": 1.4052625284738042e-05,
"logits/chosen": 1.7898054122924805,
"logits/rejected": 1.698301076889038,
"logps/chosen": -219.19509887695312,
"logps/rejected": -229.88052368164062,
"loss": 0.4222,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -2.7343392372131348,
"rewards/margins": 1.2842615842819214,
"rewards/rejected": -4.018601417541504,
"step": 1770
},
{
"epoch": 0.5068337129840547,
"grad_norm": 11.711604118347168,
"learning_rate": 1.40523576309795e-05,
"logits/chosen": 1.5762044191360474,
"logits/rejected": 1.5645755529403687,
"logps/chosen": -222.1739959716797,
"logps/rejected": -230.30014038085938,
"loss": 0.5699,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": -2.8136379718780518,
"rewards/margins": 0.9600374102592468,
"rewards/rejected": -3.7736752033233643,
"step": 1780
},
{
"epoch": 0.5096810933940774,
"grad_norm": 13.225159645080566,
"learning_rate": 1.4052089977220958e-05,
"logits/chosen": 1.4274075031280518,
"logits/rejected": 1.366379976272583,
"logps/chosen": -209.89151000976562,
"logps/rejected": -229.04910278320312,
"loss": 0.4188,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -2.8537209033966064,
"rewards/margins": 1.4593942165374756,
"rewards/rejected": -4.313115119934082,
"step": 1790
},
{
"epoch": 0.5125284738041003,
"grad_norm": 12.145174980163574,
"learning_rate": 1.4051822323462414e-05,
"logits/chosen": 1.1467812061309814,
"logits/rejected": 1.1414228677749634,
"logps/chosen": -217.54013061523438,
"logps/rejected": -235.8665771484375,
"loss": 0.5305,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.0461976528167725,
"rewards/margins": 1.3629333972930908,
"rewards/rejected": -4.409131050109863,
"step": 1800
},
{
"epoch": 0.515375854214123,
"grad_norm": 12.6741304397583,
"learning_rate": 1.4051554669703873e-05,
"logits/chosen": 1.9150466918945312,
"logits/rejected": 1.8071298599243164,
"logps/chosen": -220.7820281982422,
"logps/rejected": -233.08389282226562,
"loss": 0.3941,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -2.813331127166748,
"rewards/margins": 1.5690648555755615,
"rewards/rejected": -4.3823957443237305,
"step": 1810
},
{
"epoch": 0.5182232346241458,
"grad_norm": 15.344178199768066,
"learning_rate": 1.405128701594533e-05,
"logits/chosen": 1.4123234748840332,
"logits/rejected": 1.3962544202804565,
"logps/chosen": -217.7815399169922,
"logps/rejected": -224.8722381591797,
"loss": 0.6154,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -3.3274052143096924,
"rewards/margins": 0.9693098068237305,
"rewards/rejected": -4.296715259552002,
"step": 1820
},
{
"epoch": 0.5210706150341685,
"grad_norm": 10.50731086730957,
"learning_rate": 1.4051019362186789e-05,
"logits/chosen": 1.4529472589492798,
"logits/rejected": 1.4127845764160156,
"logps/chosen": -215.39364624023438,
"logps/rejected": -225.745361328125,
"loss": 0.4139,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.4642977714538574,
"rewards/margins": 1.5033318996429443,
"rewards/rejected": -3.9676296710968018,
"step": 1830
},
{
"epoch": 0.5239179954441914,
"grad_norm": 14.13061237335205,
"learning_rate": 1.4050751708428247e-05,
"logits/chosen": 1.4374377727508545,
"logits/rejected": 1.3640098571777344,
"logps/chosen": -216.8865203857422,
"logps/rejected": -234.1781005859375,
"loss": 0.4319,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -2.7498998641967773,
"rewards/margins": 1.7011677026748657,
"rewards/rejected": -4.451067924499512,
"step": 1840
},
{
"epoch": 0.5267653758542141,
"grad_norm": 13.41270923614502,
"learning_rate": 1.4050484054669705e-05,
"logits/chosen": 1.7079713344573975,
"logits/rejected": 1.695387840270996,
"logps/chosen": -222.7223663330078,
"logps/rejected": -232.77090454101562,
"loss": 0.5748,
"rewards/accuracies": 0.7499999403953552,
"rewards/chosen": -3.0100722312927246,
"rewards/margins": 1.1305018663406372,
"rewards/rejected": -4.1405744552612305,
"step": 1850
},
{
"epoch": 0.5296127562642369,
"grad_norm": 12.832780838012695,
"learning_rate": 1.4050216400911163e-05,
"logits/chosen": 2.0568480491638184,
"logits/rejected": 1.9985520839691162,
"logps/chosen": -212.0758819580078,
"logps/rejected": -225.4073486328125,
"loss": 0.4213,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0503463745117188,
"rewards/margins": 1.4921596050262451,
"rewards/rejected": -3.542506456375122,
"step": 1860
},
{
"epoch": 0.5324601366742597,
"grad_norm": 18.81167984008789,
"learning_rate": 1.404994874715262e-05,
"logits/chosen": 1.6365169286727905,
"logits/rejected": 1.6133226156234741,
"logps/chosen": -204.10067749023438,
"logps/rejected": -214.98257446289062,
"loss": 0.6072,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -1.8329346179962158,
"rewards/margins": 0.8236812353134155,
"rewards/rejected": -2.656615972518921,
"step": 1870
},
{
"epoch": 0.5353075170842825,
"grad_norm": 11.395954132080078,
"learning_rate": 1.4049681093394078e-05,
"logits/chosen": 1.637677788734436,
"logits/rejected": 1.5944675207138062,
"logps/chosen": -202.73739624023438,
"logps/rejected": -214.62820434570312,
"loss": 0.5573,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -1.8346195220947266,
"rewards/margins": 1.047932505607605,
"rewards/rejected": -2.882551908493042,
"step": 1880
},
{
"epoch": 0.5381548974943052,
"grad_norm": 12.5642728805542,
"learning_rate": 1.4049413439635536e-05,
"logits/chosen": 1.4088985919952393,
"logits/rejected": 1.3726544380187988,
"logps/chosen": -211.74282836914062,
"logps/rejected": -227.03305053710938,
"loss": 0.6012,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.4925270080566406,
"rewards/margins": 1.1374258995056152,
"rewards/rejected": -3.629952907562256,
"step": 1890
},
{
"epoch": 0.541002277904328,
"grad_norm": 18.130271911621094,
"learning_rate": 1.4049145785876993e-05,
"logits/chosen": 1.833918809890747,
"logits/rejected": 1.8095060586929321,
"logps/chosen": -213.5587921142578,
"logps/rejected": -226.3291473388672,
"loss": 0.4597,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.095309019088745,
"rewards/margins": 1.4185867309570312,
"rewards/rejected": -3.5138957500457764,
"step": 1900
},
{
"epoch": 0.5438496583143508,
"grad_norm": 5.79637336730957,
"learning_rate": 1.4048878132118451e-05,
"logits/chosen": 1.5354747772216797,
"logits/rejected": 1.4952408075332642,
"logps/chosen": -218.20327758789062,
"logps/rejected": -226.9008026123047,
"loss": 0.5443,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -2.4994616508483887,
"rewards/margins": 1.0572245121002197,
"rewards/rejected": -3.5566864013671875,
"step": 1910
},
{
"epoch": 0.5466970387243736,
"grad_norm": 8.202295303344727,
"learning_rate": 1.4048610478359909e-05,
"logits/chosen": 1.4175798892974854,
"logits/rejected": 1.3886915445327759,
"logps/chosen": -214.7366485595703,
"logps/rejected": -224.1475372314453,
"loss": 0.5296,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -2.728301525115967,
"rewards/margins": 1.0051238536834717,
"rewards/rejected": -3.7334251403808594,
"step": 1920
},
{
"epoch": 0.5495444191343963,
"grad_norm": 11.456938743591309,
"learning_rate": 1.4048342824601367e-05,
"logits/chosen": 1.2038923501968384,
"logits/rejected": 1.2050249576568604,
"logps/chosen": -210.61184692382812,
"logps/rejected": -225.3157958984375,
"loss": 0.4021,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -1.8231241703033447,
"rewards/margins": 1.664263129234314,
"rewards/rejected": -3.487387180328369,
"step": 1930
},
{
"epoch": 0.5523917995444191,
"grad_norm": 7.846871852874756,
"learning_rate": 1.4048075170842824e-05,
"logits/chosen": 1.7540676593780518,
"logits/rejected": 1.7196149826049805,
"logps/chosen": -210.4138946533203,
"logps/rejected": -223.84194946289062,
"loss": 0.4072,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.289306163787842,
"rewards/margins": 1.4605536460876465,
"rewards/rejected": -3.7498602867126465,
"step": 1940
},
{
"epoch": 0.5552391799544419,
"grad_norm": 10.873760223388672,
"learning_rate": 1.4047807517084282e-05,
"logits/chosen": 1.411709189414978,
"logits/rejected": 1.394074559211731,
"logps/chosen": -211.9527587890625,
"logps/rejected": -222.88851928710938,
"loss": 0.5421,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.567718505859375,
"rewards/margins": 1.1373217105865479,
"rewards/rejected": -3.705040693283081,
"step": 1950
},
{
"epoch": 0.5580865603644647,
"grad_norm": 13.660347938537598,
"learning_rate": 1.404753986332574e-05,
"logits/chosen": 2.127122163772583,
"logits/rejected": 2.038539409637451,
"logps/chosen": -208.9009246826172,
"logps/rejected": -228.55947875976562,
"loss": 0.3832,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.088381052017212,
"rewards/margins": 1.6521527767181396,
"rewards/rejected": -3.7405338287353516,
"step": 1960
},
{
"epoch": 0.5609339407744874,
"grad_norm": 6.0093488693237305,
"learning_rate": 1.4047272209567198e-05,
"logits/chosen": 1.5527527332305908,
"logits/rejected": 1.4864261150360107,
"logps/chosen": -208.0782928466797,
"logps/rejected": -223.2481689453125,
"loss": 0.3406,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -1.4067806005477905,
"rewards/margins": 1.806133508682251,
"rewards/rejected": -3.2129147052764893,
"step": 1970
},
{
"epoch": 0.5637813211845103,
"grad_norm": 21.775585174560547,
"learning_rate": 1.4047004555808656e-05,
"logits/chosen": 1.493690848350525,
"logits/rejected": 1.4828380346298218,
"logps/chosen": -211.7380828857422,
"logps/rejected": -225.0364227294922,
"loss": 0.575,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -1.9734036922454834,
"rewards/margins": 1.117187738418579,
"rewards/rejected": -3.0905914306640625,
"step": 1980
},
{
"epoch": 0.566628701594533,
"grad_norm": 15.051896095275879,
"learning_rate": 1.4046736902050115e-05,
"logits/chosen": 1.7279115915298462,
"logits/rejected": 1.6872913837432861,
"logps/chosen": -212.10720825195312,
"logps/rejected": -215.74484252929688,
"loss": 0.6145,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.833011269569397,
"rewards/margins": 1.0812532901763916,
"rewards/rejected": -2.914264440536499,
"step": 1990
},
{
"epoch": 0.5694760820045558,
"grad_norm": 5.998073101043701,
"learning_rate": 1.4046469248291573e-05,
"logits/chosen": 1.845990777015686,
"logits/rejected": 1.782735824584961,
"logps/chosen": -199.06289672851562,
"logps/rejected": -205.4522705078125,
"loss": 0.5772,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": -0.7677577137947083,
"rewards/margins": 0.9643963575363159,
"rewards/rejected": -1.732154130935669,
"step": 2000
},
{
"epoch": 0.5723234624145785,
"grad_norm": 5.928720951080322,
"learning_rate": 1.404620159453303e-05,
"logits/chosen": 1.8845545053482056,
"logits/rejected": 1.8174835443496704,
"logps/chosen": -196.9351043701172,
"logps/rejected": -210.2369842529297,
"loss": 0.4358,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.5869276523590088,
"rewards/margins": 1.3779737949371338,
"rewards/rejected": -1.9649015665054321,
"step": 2010
},
{
"epoch": 0.5751708428246014,
"grad_norm": 4.601451396942139,
"learning_rate": 1.4045933940774487e-05,
"logits/chosen": 2.0156209468841553,
"logits/rejected": 1.9196068048477173,
"logps/chosen": -204.44776916503906,
"logps/rejected": -219.51742553710938,
"loss": 0.4502,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.471993088722229,
"rewards/margins": 1.4576828479766846,
"rewards/rejected": -2.929675579071045,
"step": 2020
},
{
"epoch": 0.5780182232346242,
"grad_norm": 9.150592803955078,
"learning_rate": 1.4045666287015946e-05,
"logits/chosen": 1.4569592475891113,
"logits/rejected": 1.42463219165802,
"logps/chosen": -213.670166015625,
"logps/rejected": -219.941162109375,
"loss": 0.6028,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -2.2914419174194336,
"rewards/margins": 0.8582653999328613,
"rewards/rejected": -3.149707555770874,
"step": 2030
},
{
"epoch": 0.5808656036446469,
"grad_norm": 12.566153526306152,
"learning_rate": 1.4045398633257404e-05,
"logits/chosen": 1.3458335399627686,
"logits/rejected": 1.3180339336395264,
"logps/chosen": -220.4646453857422,
"logps/rejected": -230.4011688232422,
"loss": 0.4585,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -2.5000290870666504,
"rewards/margins": 1.1148213148117065,
"rewards/rejected": -3.614849805831909,
"step": 2040
},
{
"epoch": 0.5837129840546698,
"grad_norm": 6.501167297363281,
"learning_rate": 1.4045130979498862e-05,
"logits/chosen": 1.350414514541626,
"logits/rejected": 1.342252254486084,
"logps/chosen": -209.9090118408203,
"logps/rejected": -220.7002716064453,
"loss": 0.4906,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6956933736801147,
"rewards/margins": 1.4066914319992065,
"rewards/rejected": -3.1023848056793213,
"step": 2050
},
{
"epoch": 0.5865603644646925,
"grad_norm": 10.026237487792969,
"learning_rate": 1.404486332574032e-05,
"logits/chosen": 1.8001444339752197,
"logits/rejected": 1.7271541357040405,
"logps/chosen": -201.92617797851562,
"logps/rejected": -216.8983917236328,
"loss": 0.4417,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -1.1111762523651123,
"rewards/margins": 1.3895812034606934,
"rewards/rejected": -2.5007576942443848,
"step": 2060
},
{
"epoch": 0.5894077448747153,
"grad_norm": 10.341662406921387,
"learning_rate": 1.4044595671981778e-05,
"logits/chosen": 1.6093097925186157,
"logits/rejected": 1.55720853805542,
"logps/chosen": -203.5904083251953,
"logps/rejected": -214.962890625,
"loss": 0.5121,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -1.2161873579025269,
"rewards/margins": 1.0402132272720337,
"rewards/rejected": -2.2564005851745605,
"step": 2070
},
{
"epoch": 0.592255125284738,
"grad_norm": 9.483428001403809,
"learning_rate": 1.4044328018223235e-05,
"logits/chosen": 1.3485796451568604,
"logits/rejected": 1.2952206134796143,
"logps/chosen": -198.46286010742188,
"logps/rejected": -209.2959442138672,
"loss": 0.4198,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.9194291830062866,
"rewards/margins": 1.4297075271606445,
"rewards/rejected": -2.3491368293762207,
"step": 2080
},
{
"epoch": 0.5951025056947609,
"grad_norm": 6.516709804534912,
"learning_rate": 1.4044060364464693e-05,
"logits/chosen": 1.5634223222732544,
"logits/rejected": 1.5188058614730835,
"logps/chosen": -199.25697326660156,
"logps/rejected": -212.009033203125,
"loss": 0.5602,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -1.1728389263153076,
"rewards/margins": 1.0664641857147217,
"rewards/rejected": -2.2393031120300293,
"step": 2090
},
{
"epoch": 0.5979498861047836,
"grad_norm": 11.579174995422363,
"learning_rate": 1.4043792710706151e-05,
"logits/chosen": 1.5612658262252808,
"logits/rejected": 1.540191650390625,
"logps/chosen": -202.50173950195312,
"logps/rejected": -211.82626342773438,
"loss": 0.6105,
"rewards/accuracies": 0.6833333969116211,
"rewards/chosen": -1.4498227834701538,
"rewards/margins": 0.8200041055679321,
"rewards/rejected": -2.269826650619507,
"step": 2100
},
{
"epoch": 0.6007972665148064,
"grad_norm": 8.937296867370605,
"learning_rate": 1.404352505694761e-05,
"logits/chosen": 1.3049190044403076,
"logits/rejected": 1.3015944957733154,
"logps/chosen": -205.02505493164062,
"logps/rejected": -214.50637817382812,
"loss": 0.5109,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -1.48309326171875,
"rewards/margins": 1.045582890510559,
"rewards/rejected": -2.5286762714385986,
"step": 2110
},
{
"epoch": 0.6036446469248291,
"grad_norm": 12.235701560974121,
"learning_rate": 1.4043257403189068e-05,
"logits/chosen": 1.2076904773712158,
"logits/rejected": 1.176274061203003,
"logps/chosen": -212.11544799804688,
"logps/rejected": -223.24087524414062,
"loss": 0.4785,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -1.900011420249939,
"rewards/margins": 1.1970151662826538,
"rewards/rejected": -3.0970263481140137,
"step": 2120
},
{
"epoch": 0.606492027334852,
"grad_norm": 4.460309982299805,
"learning_rate": 1.4042989749430524e-05,
"logits/chosen": 1.0031187534332275,
"logits/rejected": 1.0076453685760498,
"logps/chosen": -210.66738891601562,
"logps/rejected": -221.1210479736328,
"loss": 0.5552,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.1503262519836426,
"rewards/margins": 1.0407392978668213,
"rewards/rejected": -3.191065549850464,
"step": 2130
},
{
"epoch": 0.6093394077448747,
"grad_norm": 10.836349487304688,
"learning_rate": 1.4042722095671982e-05,
"logits/chosen": 1.7120803594589233,
"logits/rejected": 1.6473640203475952,
"logps/chosen": -205.09329223632812,
"logps/rejected": -219.45925903320312,
"loss": 0.43,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.839080572128296,
"rewards/margins": 1.4085099697113037,
"rewards/rejected": -3.2475905418395996,
"step": 2140
},
{
"epoch": 0.6121867881548975,
"grad_norm": 9.348235130310059,
"learning_rate": 1.404245444191344e-05,
"logits/chosen": 1.4184033870697021,
"logits/rejected": 1.4061453342437744,
"logps/chosen": -201.64305114746094,
"logps/rejected": -215.83334350585938,
"loss": 0.4861,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -1.3850140571594238,
"rewards/margins": 1.0105946063995361,
"rewards/rejected": -2.395608901977539,
"step": 2150
},
{
"epoch": 0.6150341685649203,
"grad_norm": 7.500626564025879,
"learning_rate": 1.4042186788154897e-05,
"logits/chosen": 1.6477140188217163,
"logits/rejected": 1.5993112325668335,
"logps/chosen": -207.88192749023438,
"logps/rejected": -220.2103729248047,
"loss": 0.5188,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -1.7979981899261475,
"rewards/margins": 0.9870179295539856,
"rewards/rejected": -2.7850160598754883,
"step": 2160
},
{
"epoch": 0.6178815489749431,
"grad_norm": 7.5613226890563965,
"learning_rate": 1.4041919134396355e-05,
"logits/chosen": 1.591292381286621,
"logits/rejected": 1.5465342998504639,
"logps/chosen": -211.07601928710938,
"logps/rejected": -226.35107421875,
"loss": 0.3141,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -2.4218368530273438,
"rewards/margins": 1.6328353881835938,
"rewards/rejected": -4.0546722412109375,
"step": 2170
},
{
"epoch": 0.6207289293849658,
"grad_norm": 7.95067834854126,
"learning_rate": 1.4041651480637813e-05,
"logits/chosen": 1.7995331287384033,
"logits/rejected": 1.7834396362304688,
"logps/chosen": -212.1343536376953,
"logps/rejected": -230.96517944335938,
"loss": 0.3429,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -1.9310157299041748,
"rewards/margins": 1.5808441638946533,
"rewards/rejected": -3.5118603706359863,
"step": 2180
},
{
"epoch": 0.6235763097949886,
"grad_norm": 11.615964889526367,
"learning_rate": 1.4041383826879271e-05,
"logits/chosen": 1.692636251449585,
"logits/rejected": 1.6496816873550415,
"logps/chosen": -216.45529174804688,
"logps/rejected": -233.55355834960938,
"loss": 0.4677,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -2.4663166999816895,
"rewards/margins": 1.7424099445343018,
"rewards/rejected": -4.208726406097412,
"step": 2190
},
{
"epoch": 0.6264236902050114,
"grad_norm": 16.468753814697266,
"learning_rate": 1.404111617312073e-05,
"logits/chosen": 1.7807762622833252,
"logits/rejected": 1.729928970336914,
"logps/chosen": -222.51431274414062,
"logps/rejected": -239.35311889648438,
"loss": 0.4872,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9725635051727295,
"rewards/margins": 1.4294450283050537,
"rewards/rejected": -4.402008533477783,
"step": 2200
},
{
"epoch": 0.6292710706150342,
"grad_norm": 10.072558403015137,
"learning_rate": 1.4040848519362188e-05,
"logits/chosen": 1.8452990055084229,
"logits/rejected": 1.7833513021469116,
"logps/chosen": -233.5411834716797,
"logps/rejected": -248.5119171142578,
"loss": 0.4603,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -3.804687976837158,
"rewards/margins": 1.5763683319091797,
"rewards/rejected": -5.381056785583496,
"step": 2210
},
{
"epoch": 0.6321184510250569,
"grad_norm": 19.554895401000977,
"learning_rate": 1.4040580865603646e-05,
"logits/chosen": 1.4583690166473389,
"logits/rejected": 1.3847358226776123,
"logps/chosen": -231.95474243164062,
"logps/rejected": -237.9390106201172,
"loss": 0.4337,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -4.024319648742676,
"rewards/margins": 1.3094077110290527,
"rewards/rejected": -5.3337273597717285,
"step": 2220
},
{
"epoch": 0.6349658314350797,
"grad_norm": 16.957067489624023,
"learning_rate": 1.4040313211845102e-05,
"logits/chosen": 1.5189851522445679,
"logits/rejected": 1.4547855854034424,
"logps/chosen": -220.9446563720703,
"logps/rejected": -236.49050903320312,
"loss": 0.4954,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -3.6130733489990234,
"rewards/margins": 1.352766990661621,
"rewards/rejected": -4.9658403396606445,
"step": 2230
},
{
"epoch": 0.6378132118451025,
"grad_norm": 14.116981506347656,
"learning_rate": 1.404004555808656e-05,
"logits/chosen": 1.5582940578460693,
"logits/rejected": 1.4913814067840576,
"logps/chosen": -220.11062622070312,
"logps/rejected": -236.986083984375,
"loss": 0.3916,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -2.6138153076171875,
"rewards/margins": 1.7570054531097412,
"rewards/rejected": -4.370820045471191,
"step": 2240
},
{
"epoch": 0.6406605922551253,
"grad_norm": 5.499735355377197,
"learning_rate": 1.4039777904328019e-05,
"logits/chosen": 1.5779650211334229,
"logits/rejected": 1.5341997146606445,
"logps/chosen": -215.645263671875,
"logps/rejected": -229.6122283935547,
"loss": 0.3973,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -2.5405592918395996,
"rewards/margins": 1.463417649269104,
"rewards/rejected": -4.003976821899414,
"step": 2250
},
{
"epoch": 0.643507972665148,
"grad_norm": 6.563097953796387,
"learning_rate": 1.4039510250569477e-05,
"logits/chosen": 1.6726646423339844,
"logits/rejected": 1.6540085077285767,
"logps/chosen": -224.57510375976562,
"logps/rejected": -235.57284545898438,
"loss": 0.4268,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -2.6444642543792725,
"rewards/margins": 1.399488091468811,
"rewards/rejected": -4.043952465057373,
"step": 2260
},
{
"epoch": 0.6463553530751709,
"grad_norm": 17.193113327026367,
"learning_rate": 1.4039242596810935e-05,
"logits/chosen": 1.5830856561660767,
"logits/rejected": 1.5117225646972656,
"logps/chosen": -223.4087677001953,
"logps/rejected": -238.30709838867188,
"loss": 0.5128,
"rewards/accuracies": 0.8500000834465027,
"rewards/chosen": -3.0339622497558594,
"rewards/margins": 1.8273839950561523,
"rewards/rejected": -4.861346244812012,
"step": 2270
},
{
"epoch": 0.6492027334851936,
"grad_norm": 22.565580368041992,
"learning_rate": 1.4038974943052393e-05,
"logits/chosen": 1.512303113937378,
"logits/rejected": 1.47734534740448,
"logps/chosen": -235.7754364013672,
"logps/rejected": -248.6204833984375,
"loss": 0.5663,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -4.013455390930176,
"rewards/margins": 1.3606445789337158,
"rewards/rejected": -5.3740997314453125,
"step": 2280
},
{
"epoch": 0.6520501138952164,
"grad_norm": 6.0489349365234375,
"learning_rate": 1.4038707289293851e-05,
"logits/chosen": 1.7461280822753906,
"logits/rejected": 1.7028367519378662,
"logps/chosen": -232.1387939453125,
"logps/rejected": -248.25048828125,
"loss": 0.3073,
"rewards/accuracies": 0.8999999165534973,
"rewards/chosen": -4.349621772766113,
"rewards/margins": 1.8438152074813843,
"rewards/rejected": -6.193437099456787,
"step": 2290
},
{
"epoch": 0.6548974943052391,
"grad_norm": 15.137632369995117,
"learning_rate": 1.4038439635535308e-05,
"logits/chosen": 1.4312952756881714,
"logits/rejected": 1.3589736223220825,
"logps/chosen": -237.01358032226562,
"logps/rejected": -246.2064208984375,
"loss": 0.5728,
"rewards/accuracies": 0.6999999284744263,
"rewards/chosen": -4.380759239196777,
"rewards/margins": 1.162846565246582,
"rewards/rejected": -5.543606281280518,
"step": 2300
},
{
"epoch": 0.657744874715262,
"grad_norm": 5.352278709411621,
"learning_rate": 1.4038171981776766e-05,
"logits/chosen": 1.5901074409484863,
"logits/rejected": 1.547982931137085,
"logps/chosen": -228.4188232421875,
"logps/rejected": -245.8745574951172,
"loss": 0.5632,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -4.198389053344727,
"rewards/margins": 1.272357702255249,
"rewards/rejected": -5.470747470855713,
"step": 2310
},
{
"epoch": 0.6605922551252847,
"grad_norm": 13.036628723144531,
"learning_rate": 1.4037904328018224e-05,
"logits/chosen": 1.4683014154434204,
"logits/rejected": 1.424804449081421,
"logps/chosen": -234.803955078125,
"logps/rejected": -243.31240844726562,
"loss": 0.5059,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -4.2275567054748535,
"rewards/margins": 1.2187873125076294,
"rewards/rejected": -5.446343898773193,
"step": 2320
},
{
"epoch": 0.6634396355353075,
"grad_norm": 6.533375263214111,
"learning_rate": 1.4037636674259682e-05,
"logits/chosen": 0.9792621731758118,
"logits/rejected": 0.9765647053718567,
"logps/chosen": -230.74819946289062,
"logps/rejected": -242.41702270507812,
"loss": 0.5086,
"rewards/accuracies": 0.7499999403953552,
"rewards/chosen": -4.1636061668396,
"rewards/margins": 1.1499837636947632,
"rewards/rejected": -5.313590049743652,
"step": 2330
},
{
"epoch": 0.6662870159453302,
"grad_norm": 18.044649124145508,
"learning_rate": 1.403736902050114e-05,
"logits/chosen": 1.584518551826477,
"logits/rejected": 1.54861319065094,
"logps/chosen": -239.1269073486328,
"logps/rejected": -250.5943145751953,
"loss": 0.609,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -4.921685218811035,
"rewards/margins": 1.0676127672195435,
"rewards/rejected": -5.989298343658447,
"step": 2340
},
{
"epoch": 0.6691343963553531,
"grad_norm": 19.3489990234375,
"learning_rate": 1.4037101366742597e-05,
"logits/chosen": 1.6986758708953857,
"logits/rejected": 1.633707046508789,
"logps/chosen": -240.86245727539062,
"logps/rejected": -250.063232421875,
"loss": 0.4793,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -4.555212020874023,
"rewards/margins": 1.461682915687561,
"rewards/rejected": -6.016894817352295,
"step": 2350
},
{
"epoch": 0.6719817767653758,
"grad_norm": 19.8951358795166,
"learning_rate": 1.4036833712984055e-05,
"logits/chosen": 1.5498108863830566,
"logits/rejected": 1.521090030670166,
"logps/chosen": -231.66934204101562,
"logps/rejected": -248.7901611328125,
"loss": 0.3812,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -4.384482383728027,
"rewards/margins": 1.5079156160354614,
"rewards/rejected": -5.892397880554199,
"step": 2360
},
{
"epoch": 0.6748291571753986,
"grad_norm": 5.044203758239746,
"learning_rate": 1.4036566059225512e-05,
"logits/chosen": 1.6914507150650024,
"logits/rejected": 1.6152887344360352,
"logps/chosen": -239.70755004882812,
"logps/rejected": -258.3238220214844,
"loss": 0.4626,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -5.004892826080322,
"rewards/margins": 1.4128177165985107,
"rewards/rejected": -6.417710781097412,
"step": 2370
},
{
"epoch": 0.6776765375854215,
"grad_norm": 8.82512092590332,
"learning_rate": 1.403629840546697e-05,
"logits/chosen": 1.7223188877105713,
"logits/rejected": 1.6093647480010986,
"logps/chosen": -240.8391876220703,
"logps/rejected": -253.4154815673828,
"loss": 0.4126,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -4.948991298675537,
"rewards/margins": 1.9588031768798828,
"rewards/rejected": -6.907794952392578,
"step": 2380
},
{
"epoch": 0.6805239179954442,
"grad_norm": 9.408480644226074,
"learning_rate": 1.4036030751708428e-05,
"logits/chosen": 1.3756048679351807,
"logits/rejected": 1.3040671348571777,
"logps/chosen": -231.7460479736328,
"logps/rejected": -252.6634063720703,
"loss": 0.4124,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -4.464608669281006,
"rewards/margins": 1.6818110942840576,
"rewards/rejected": -6.146419525146484,
"step": 2390
},
{
"epoch": 0.683371298405467,
"grad_norm": 14.945960998535156,
"learning_rate": 1.4035763097949886e-05,
"logits/chosen": 1.5611943006515503,
"logits/rejected": 1.513925552368164,
"logps/chosen": -234.7378387451172,
"logps/rejected": -255.2218017578125,
"loss": 0.3549,
"rewards/accuracies": 0.8500000834465027,
"rewards/chosen": -4.391298770904541,
"rewards/margins": 1.9131309986114502,
"rewards/rejected": -6.304429054260254,
"step": 2400
},
{
"epoch": 0.6862186788154897,
"grad_norm": 22.232023239135742,
"learning_rate": 1.4035495444191344e-05,
"logits/chosen": 1.219208002090454,
"logits/rejected": 1.1519418954849243,
"logps/chosen": -246.47842407226562,
"logps/rejected": -261.6808776855469,
"loss": 0.4832,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -5.468182563781738,
"rewards/margins": 1.899754285812378,
"rewards/rejected": -7.367936611175537,
"step": 2410
},
{
"epoch": 0.6890660592255126,
"grad_norm": 10.423013687133789,
"learning_rate": 1.4035227790432803e-05,
"logits/chosen": 1.3841204643249512,
"logits/rejected": 1.303436279296875,
"logps/chosen": -253.433349609375,
"logps/rejected": -266.01544189453125,
"loss": 0.4217,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -6.0178141593933105,
"rewards/margins": 1.7622039318084717,
"rewards/rejected": -7.7800188064575195,
"step": 2420
},
{
"epoch": 0.6919134396355353,
"grad_norm": 11.22570514678955,
"learning_rate": 1.403496013667426e-05,
"logits/chosen": 1.279226541519165,
"logits/rejected": 1.2061641216278076,
"logps/chosen": -245.2624969482422,
"logps/rejected": -256.43939208984375,
"loss": 0.4384,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -5.069748878479004,
"rewards/margins": 1.824032187461853,
"rewards/rejected": -6.8937811851501465,
"step": 2430
},
{
"epoch": 0.6947608200455581,
"grad_norm": 12.202534675598145,
"learning_rate": 1.4034692482915717e-05,
"logits/chosen": 1.742522954940796,
"logits/rejected": 1.6569029092788696,
"logps/chosen": -235.7698211669922,
"logps/rejected": -249.0869140625,
"loss": 0.5603,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -4.401494979858398,
"rewards/margins": 1.5202449560165405,
"rewards/rejected": -5.921740531921387,
"step": 2440
},
{
"epoch": 0.6976082004555809,
"grad_norm": 11.25021743774414,
"learning_rate": 1.4034424829157175e-05,
"logits/chosen": 1.2875077724456787,
"logits/rejected": 1.2242827415466309,
"logps/chosen": -235.0489501953125,
"logps/rejected": -250.44076538085938,
"loss": 0.4157,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -4.398201942443848,
"rewards/margins": 2.000248432159424,
"rewards/rejected": -6.398449897766113,
"step": 2450
},
{
"epoch": 0.7004555808656037,
"grad_norm": 10.962873458862305,
"learning_rate": 1.4034157175398634e-05,
"logits/chosen": 1.6589164733886719,
"logits/rejected": 1.5607925653457642,
"logps/chosen": -247.64437866210938,
"logps/rejected": -264.1566467285156,
"loss": 0.506,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -5.916208744049072,
"rewards/margins": 1.4422000646591187,
"rewards/rejected": -7.3584089279174805,
"step": 2460
},
{
"epoch": 0.7033029612756264,
"grad_norm": 6.1506500244140625,
"learning_rate": 1.4033889521640092e-05,
"logits/chosen": 1.1968879699707031,
"logits/rejected": 1.1140633821487427,
"logps/chosen": -254.21255493164062,
"logps/rejected": -268.221435546875,
"loss": 0.4051,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.287491798400879,
"rewards/margins": 1.4170840978622437,
"rewards/rejected": -7.704575538635254,
"step": 2470
},
{
"epoch": 0.7061503416856492,
"grad_norm": 7.118803024291992,
"learning_rate": 1.403362186788155e-05,
"logits/chosen": 1.3762328624725342,
"logits/rejected": 1.2869064807891846,
"logps/chosen": -240.8204803466797,
"logps/rejected": -257.37249755859375,
"loss": 0.5173,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -5.4389543533325195,
"rewards/margins": 1.7129642963409424,
"rewards/rejected": -7.151918888092041,
"step": 2480
},
{
"epoch": 0.708997722095672,
"grad_norm": 8.637585639953613,
"learning_rate": 1.4033354214123008e-05,
"logits/chosen": 1.5181769132614136,
"logits/rejected": 1.4586423635482788,
"logps/chosen": -244.4365234375,
"logps/rejected": -262.47332763671875,
"loss": 0.4751,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -5.681598663330078,
"rewards/margins": 1.891000747680664,
"rewards/rejected": -7.5725998878479,
"step": 2490
},
{
"epoch": 0.7118451025056948,
"grad_norm": 12.983312606811523,
"learning_rate": 1.4033086560364466e-05,
"logits/chosen": 1.3992502689361572,
"logits/rejected": 1.3234424591064453,
"logps/chosen": -253.96762084960938,
"logps/rejected": -270.48016357421875,
"loss": 0.4632,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -6.435060977935791,
"rewards/margins": 1.5852489471435547,
"rewards/rejected": -8.02031135559082,
"step": 2500
},
{
"epoch": 0.7146924829157175,
"grad_norm": 13.200767517089844,
"learning_rate": 1.4032818906605923e-05,
"logits/chosen": 1.6667852401733398,
"logits/rejected": 1.5995807647705078,
"logps/chosen": -254.1513671875,
"logps/rejected": -273.9045104980469,
"loss": 0.5527,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -6.633721351623535,
"rewards/margins": 1.3930259943008423,
"rewards/rejected": -8.02674674987793,
"step": 2510
},
{
"epoch": 0.7175398633257403,
"grad_norm": 20.3078670501709,
"learning_rate": 1.4032551252847381e-05,
"logits/chosen": 1.3614373207092285,
"logits/rejected": 1.3755300045013428,
"logps/chosen": -247.287353515625,
"logps/rejected": -260.49456787109375,
"loss": 0.4998,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -5.98111629486084,
"rewards/margins": 1.3161729574203491,
"rewards/rejected": -7.297289848327637,
"step": 2520
},
{
"epoch": 0.7203872437357631,
"grad_norm": 4.286506175994873,
"learning_rate": 1.4032283599088839e-05,
"logits/chosen": 1.3778407573699951,
"logits/rejected": 1.3152930736541748,
"logps/chosen": -234.5768280029297,
"logps/rejected": -255.61819458007812,
"loss": 0.372,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.508410930633545,
"rewards/margins": 1.7014334201812744,
"rewards/rejected": -6.20984411239624,
"step": 2530
},
{
"epoch": 0.7232346241457859,
"grad_norm": 14.069003105163574,
"learning_rate": 1.4032015945330297e-05,
"logits/chosen": 1.3468763828277588,
"logits/rejected": 1.233034372329712,
"logps/chosen": -233.3365478515625,
"logps/rejected": -254.7558135986328,
"loss": 0.4187,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -4.505204677581787,
"rewards/margins": 1.8832544088363647,
"rewards/rejected": -6.388458728790283,
"step": 2540
},
{
"epoch": 0.7260820045558086,
"grad_norm": 11.429390907287598,
"learning_rate": 1.4031748291571755e-05,
"logits/chosen": 1.393408179283142,
"logits/rejected": 1.2685819864273071,
"logps/chosen": -232.78707885742188,
"logps/rejected": -258.8204345703125,
"loss": 0.3655,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.111529350280762,
"rewards/margins": 1.9012877941131592,
"rewards/rejected": -6.012816429138184,
"step": 2550
},
{
"epoch": 0.7289293849658315,
"grad_norm": 7.633700847625732,
"learning_rate": 1.4031480637813214e-05,
"logits/chosen": 1.7750869989395142,
"logits/rejected": 1.6736621856689453,
"logps/chosen": -239.56918334960938,
"logps/rejected": -261.6385192871094,
"loss": 0.2991,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -4.729853630065918,
"rewards/margins": 2.528153657913208,
"rewards/rejected": -7.258008003234863,
"step": 2560
},
{
"epoch": 0.7317767653758542,
"grad_norm": 19.738534927368164,
"learning_rate": 1.403121298405467e-05,
"logits/chosen": 1.4052790403366089,
"logits/rejected": 1.363965630531311,
"logps/chosen": -255.84262084960938,
"logps/rejected": -268.30609130859375,
"loss": 0.4441,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.05529260635376,
"rewards/margins": 1.61709725856781,
"rewards/rejected": -7.672389030456543,
"step": 2570
},
{
"epoch": 0.734624145785877,
"grad_norm": 3.8141791820526123,
"learning_rate": 1.4030945330296127e-05,
"logits/chosen": 1.2736724615097046,
"logits/rejected": 1.1940056085586548,
"logps/chosen": -252.11923217773438,
"logps/rejected": -271.30157470703125,
"loss": 0.4046,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.049666881561279,
"rewards/margins": 1.935879111289978,
"rewards/rejected": -7.9855451583862305,
"step": 2580
},
{
"epoch": 0.7374715261958997,
"grad_norm": 12.9832181930542,
"learning_rate": 1.4030677676537585e-05,
"logits/chosen": 1.148601770401001,
"logits/rejected": 1.0851247310638428,
"logps/chosen": -248.23617553710938,
"logps/rejected": -267.77691650390625,
"loss": 0.3661,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -5.982962608337402,
"rewards/margins": 2.2053725719451904,
"rewards/rejected": -8.188336372375488,
"step": 2590
},
{
"epoch": 0.7403189066059226,
"grad_norm": 8.390840530395508,
"learning_rate": 1.4030410022779043e-05,
"logits/chosen": 1.5896836519241333,
"logits/rejected": 1.5164399147033691,
"logps/chosen": -248.79019165039062,
"logps/rejected": -263.56243896484375,
"loss": 0.4359,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -5.833072662353516,
"rewards/margins": 1.9023926258087158,
"rewards/rejected": -7.735465049743652,
"step": 2600
},
{
"epoch": 0.7431662870159453,
"grad_norm": 17.48469352722168,
"learning_rate": 1.4030142369020501e-05,
"logits/chosen": 1.5616130828857422,
"logits/rejected": 1.5186275243759155,
"logps/chosen": -242.703125,
"logps/rejected": -257.0491027832031,
"loss": 0.4011,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -4.932897567749023,
"rewards/margins": 2.0841403007507324,
"rewards/rejected": -7.017037868499756,
"step": 2610
},
{
"epoch": 0.7460136674259681,
"grad_norm": 7.127035140991211,
"learning_rate": 1.402987471526196e-05,
"logits/chosen": 1.4148176908493042,
"logits/rejected": 1.3808656930923462,
"logps/chosen": -248.82455444335938,
"logps/rejected": -267.6011962890625,
"loss": 0.5825,
"rewards/accuracies": 0.7833333015441895,
"rewards/chosen": -5.617693901062012,
"rewards/margins": 1.4519156217575073,
"rewards/rejected": -7.069609642028809,
"step": 2620
},
{
"epoch": 0.7488610478359908,
"grad_norm": 27.17725944519043,
"learning_rate": 1.4029607061503418e-05,
"logits/chosen": 1.1969980001449585,
"logits/rejected": 1.1847108602523804,
"logps/chosen": -261.9609375,
"logps/rejected": -281.9357604980469,
"loss": 0.4544,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.976052284240723,
"rewards/margins": 1.6174719333648682,
"rewards/rejected": -8.593523025512695,
"step": 2630
},
{
"epoch": 0.7517084282460137,
"grad_norm": 10.162541389465332,
"learning_rate": 1.4029339407744876e-05,
"logits/chosen": 1.1881685256958008,
"logits/rejected": 1.162522792816162,
"logps/chosen": -270.0924987792969,
"logps/rejected": -284.1805725097656,
"loss": 0.4265,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -7.348545074462891,
"rewards/margins": 1.6283506155014038,
"rewards/rejected": -8.976896286010742,
"step": 2640
},
{
"epoch": 0.7545558086560364,
"grad_norm": 8.748689651489258,
"learning_rate": 1.4029071753986332e-05,
"logits/chosen": 1.2383089065551758,
"logits/rejected": 1.1446388959884644,
"logps/chosen": -264.23828125,
"logps/rejected": -284.57122802734375,
"loss": 0.332,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -7.279618740081787,
"rewards/margins": 2.021749973297119,
"rewards/rejected": -9.301369667053223,
"step": 2650
},
{
"epoch": 0.7574031890660592,
"grad_norm": 11.1958646774292,
"learning_rate": 1.402880410022779e-05,
"logits/chosen": 1.0028870105743408,
"logits/rejected": 0.9154938459396362,
"logps/chosen": -249.95993041992188,
"logps/rejected": -271.75054931640625,
"loss": 0.4982,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -6.213393211364746,
"rewards/margins": 1.7268741130828857,
"rewards/rejected": -7.940268039703369,
"step": 2660
},
{
"epoch": 0.760250569476082,
"grad_norm": 15.898490905761719,
"learning_rate": 1.4028536446469249e-05,
"logits/chosen": 1.257958173751831,
"logits/rejected": 1.2141045331954956,
"logps/chosen": -255.44580078125,
"logps/rejected": -270.67926025390625,
"loss": 0.4744,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -5.937500476837158,
"rewards/margins": 1.792832612991333,
"rewards/rejected": -7.730332851409912,
"step": 2670
},
{
"epoch": 0.7630979498861048,
"grad_norm": 13.417143821716309,
"learning_rate": 1.4028268792710707e-05,
"logits/chosen": 1.6417725086212158,
"logits/rejected": 1.537362813949585,
"logps/chosen": -249.5520477294922,
"logps/rejected": -270.64495849609375,
"loss": 0.4474,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -5.665754318237305,
"rewards/margins": 1.6779617071151733,
"rewards/rejected": -7.343716621398926,
"step": 2680
},
{
"epoch": 0.7659453302961275,
"grad_norm": 19.788162231445312,
"learning_rate": 1.4028001138952165e-05,
"logits/chosen": 1.6244192123413086,
"logits/rejected": 1.5817723274230957,
"logps/chosen": -246.73605346679688,
"logps/rejected": -262.739501953125,
"loss": 0.6225,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -6.028580188751221,
"rewards/margins": 1.7873833179473877,
"rewards/rejected": -7.815962791442871,
"step": 2690
},
{
"epoch": 0.7687927107061503,
"grad_norm": 21.922740936279297,
"learning_rate": 1.4027733485193623e-05,
"logits/chosen": 1.7325636148452759,
"logits/rejected": 1.6674734354019165,
"logps/chosen": -242.2566375732422,
"logps/rejected": -272.3511657714844,
"loss": 0.3497,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -5.904580116271973,
"rewards/margins": 2.298239231109619,
"rewards/rejected": -8.20281982421875,
"step": 2700
},
{
"epoch": 0.7716400911161732,
"grad_norm": 18.379051208496094,
"learning_rate": 1.4027465831435081e-05,
"logits/chosen": 1.954079031944275,
"logits/rejected": 1.889691710472107,
"logps/chosen": -246.53463745117188,
"logps/rejected": -268.9033203125,
"loss": 0.391,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -5.3877739906311035,
"rewards/margins": 2.023282051086426,
"rewards/rejected": -7.411055564880371,
"step": 2710
},
{
"epoch": 0.7744874715261959,
"grad_norm": 14.651932716369629,
"learning_rate": 1.4027198177676538e-05,
"logits/chosen": 1.7924703359603882,
"logits/rejected": 1.7352300882339478,
"logps/chosen": -241.3613739013672,
"logps/rejected": -265.5358581542969,
"loss": 0.3971,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -5.6403656005859375,
"rewards/margins": 2.175809144973755,
"rewards/rejected": -7.816174507141113,
"step": 2720
},
{
"epoch": 0.7773348519362187,
"grad_norm": 17.90581512451172,
"learning_rate": 1.4026930523917996e-05,
"logits/chosen": 1.369199514389038,
"logits/rejected": 1.3058414459228516,
"logps/chosen": -247.9556121826172,
"logps/rejected": -272.4525146484375,
"loss": 0.4489,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -6.066657066345215,
"rewards/margins": 1.838025450706482,
"rewards/rejected": -7.904683589935303,
"step": 2730
},
{
"epoch": 0.7801822323462415,
"grad_norm": 6.542720317840576,
"learning_rate": 1.4026662870159454e-05,
"logits/chosen": 1.6855316162109375,
"logits/rejected": 1.6109323501586914,
"logps/chosen": -248.1515655517578,
"logps/rejected": -272.78765869140625,
"loss": 0.3162,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -5.281750679016113,
"rewards/margins": 2.642489194869995,
"rewards/rejected": -7.924239158630371,
"step": 2740
},
{
"epoch": 0.7830296127562643,
"grad_norm": 8.27110481262207,
"learning_rate": 1.4026395216400912e-05,
"logits/chosen": 1.410160779953003,
"logits/rejected": 1.3594660758972168,
"logps/chosen": -248.23818969726562,
"logps/rejected": -270.97955322265625,
"loss": 0.3703,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -5.9429450035095215,
"rewards/margins": 2.0173375606536865,
"rewards/rejected": -7.960282802581787,
"step": 2750
},
{
"epoch": 0.785876993166287,
"grad_norm": 12.099336624145508,
"learning_rate": 1.402612756264237e-05,
"logits/chosen": 1.7231197357177734,
"logits/rejected": 1.6259253025054932,
"logps/chosen": -254.39724731445312,
"logps/rejected": -271.6213684082031,
"loss": 0.2878,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -6.0387163162231445,
"rewards/margins": 2.486721992492676,
"rewards/rejected": -8.525439262390137,
"step": 2760
},
{
"epoch": 0.7887243735763098,
"grad_norm": 10.804024696350098,
"learning_rate": 1.4025859908883829e-05,
"logits/chosen": 1.5716185569763184,
"logits/rejected": 1.486262559890747,
"logps/chosen": -256.6431884765625,
"logps/rejected": -277.954833984375,
"loss": 0.303,
"rewards/accuracies": 0.8166667819023132,
"rewards/chosen": -6.014618873596191,
"rewards/margins": 2.517322063446045,
"rewards/rejected": -8.531940460205078,
"step": 2770
},
{
"epoch": 0.7915717539863326,
"grad_norm": 15.970914840698242,
"learning_rate": 1.4025592255125287e-05,
"logits/chosen": 1.3881986141204834,
"logits/rejected": 1.3379590511322021,
"logps/chosen": -251.85659790039062,
"logps/rejected": -281.0345153808594,
"loss": 0.3171,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -6.694669246673584,
"rewards/margins": 2.450220823287964,
"rewards/rejected": -9.144889831542969,
"step": 2780
},
{
"epoch": 0.7944191343963554,
"grad_norm": 12.897587776184082,
"learning_rate": 1.4025324601366743e-05,
"logits/chosen": 1.350629210472107,
"logits/rejected": 1.3108307123184204,
"logps/chosen": -259.5316162109375,
"logps/rejected": -283.2249450683594,
"loss": 0.2645,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -6.7262091636657715,
"rewards/margins": 2.8170058727264404,
"rewards/rejected": -9.543214797973633,
"step": 2790
},
{
"epoch": 0.7972665148063781,
"grad_norm": 17.332544326782227,
"learning_rate": 1.40250569476082e-05,
"logits/chosen": 1.5189533233642578,
"logits/rejected": 1.469429612159729,
"logps/chosen": -262.32928466796875,
"logps/rejected": -275.24981689453125,
"loss": 0.3829,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.6358160972595215,
"rewards/margins": 1.8431167602539062,
"rewards/rejected": -8.47893238067627,
"step": 2800
},
{
"epoch": 0.8001138952164009,
"grad_norm": 17.475202560424805,
"learning_rate": 1.4024789293849658e-05,
"logits/chosen": 1.844002366065979,
"logits/rejected": 1.8084990978240967,
"logps/chosen": -274.48699951171875,
"logps/rejected": -298.3126525878906,
"loss": 0.431,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -8.724340438842773,
"rewards/margins": 1.9993741512298584,
"rewards/rejected": -10.723714828491211,
"step": 2810
},
{
"epoch": 0.8029612756264237,
"grad_norm": 22.87725257873535,
"learning_rate": 1.4024521640091116e-05,
"logits/chosen": 1.6398130655288696,
"logits/rejected": 1.5759761333465576,
"logps/chosen": -276.5049743652344,
"logps/rejected": -288.7770080566406,
"loss": 0.458,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -8.648481369018555,
"rewards/margins": 1.7932255268096924,
"rewards/rejected": -10.441704750061035,
"step": 2820
},
{
"epoch": 0.8058086560364465,
"grad_norm": 10.597854614257812,
"learning_rate": 1.4024253986332574e-05,
"logits/chosen": 1.5820127725601196,
"logits/rejected": 1.5301449298858643,
"logps/chosen": -259.69732666015625,
"logps/rejected": -274.4244079589844,
"loss": 0.4166,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -6.665907859802246,
"rewards/margins": 1.9291824102401733,
"rewards/rejected": -8.59508991241455,
"step": 2830
},
{
"epoch": 0.8086560364464692,
"grad_norm": 3.9533028602600098,
"learning_rate": 1.4023986332574032e-05,
"logits/chosen": 1.4371238946914673,
"logits/rejected": 1.4022338390350342,
"logps/chosen": -239.5390625,
"logps/rejected": -267.0621337890625,
"loss": 0.4274,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -5.695757865905762,
"rewards/margins": 2.3239686489105225,
"rewards/rejected": -8.019726753234863,
"step": 2840
},
{
"epoch": 0.8115034168564921,
"grad_norm": 20.184728622436523,
"learning_rate": 1.402371867881549e-05,
"logits/chosen": 1.6135759353637695,
"logits/rejected": 1.5178974866867065,
"logps/chosen": -253.1510009765625,
"logps/rejected": -278.24407958984375,
"loss": 0.4337,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.1504926681518555,
"rewards/margins": 2.674940586090088,
"rewards/rejected": -8.825433731079102,
"step": 2850
},
{
"epoch": 0.8143507972665148,
"grad_norm": 16.252994537353516,
"learning_rate": 1.4023451025056947e-05,
"logits/chosen": 1.6190258264541626,
"logits/rejected": 1.564290165901184,
"logps/chosen": -245.3192901611328,
"logps/rejected": -275.1893615722656,
"loss": 0.2521,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.764278411865234,
"rewards/margins": 2.888000011444092,
"rewards/rejected": -8.652277946472168,
"step": 2860
},
{
"epoch": 0.8171981776765376,
"grad_norm": 15.702975273132324,
"learning_rate": 1.4023183371298405e-05,
"logits/chosen": 1.4291714429855347,
"logits/rejected": 1.3949334621429443,
"logps/chosen": -260.24554443359375,
"logps/rejected": -288.02032470703125,
"loss": 0.3742,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -7.282652378082275,
"rewards/margins": 2.00462007522583,
"rewards/rejected": -9.287271499633789,
"step": 2870
},
{
"epoch": 0.8200455580865603,
"grad_norm": 29.931583404541016,
"learning_rate": 1.4022915717539863e-05,
"logits/chosen": 1.3684179782867432,
"logits/rejected": 1.3120687007904053,
"logps/chosen": -275.12432861328125,
"logps/rejected": -295.9511413574219,
"loss": 0.5024,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -8.723713874816895,
"rewards/margins": 2.193488359451294,
"rewards/rejected": -10.917202949523926,
"step": 2880
},
{
"epoch": 0.8228929384965832,
"grad_norm": 8.566882133483887,
"learning_rate": 1.4022648063781322e-05,
"logits/chosen": 1.65011465549469,
"logits/rejected": 1.5863733291625977,
"logps/chosen": -262.7909851074219,
"logps/rejected": -287.36016845703125,
"loss": 0.4629,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -7.5483293533325195,
"rewards/margins": 2.1988370418548584,
"rewards/rejected": -9.74716567993164,
"step": 2890
},
{
"epoch": 0.8257403189066059,
"grad_norm": 9.146045684814453,
"learning_rate": 1.402238041002278e-05,
"logits/chosen": 1.4438389539718628,
"logits/rejected": 1.3922678232192993,
"logps/chosen": -256.80194091796875,
"logps/rejected": -281.6923828125,
"loss": 0.3474,
"rewards/accuracies": 0.8666667938232422,
"rewards/chosen": -7.041781425476074,
"rewards/margins": 2.2522146701812744,
"rewards/rejected": -9.293996810913086,
"step": 2900
},
{
"epoch": 0.8285876993166287,
"grad_norm": 14.651105880737305,
"learning_rate": 1.4022112756264238e-05,
"logits/chosen": 1.2979462146759033,
"logits/rejected": 1.20625638961792,
"logps/chosen": -262.59722900390625,
"logps/rejected": -282.945068359375,
"loss": 0.5572,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -7.369248390197754,
"rewards/margins": 1.7789223194122314,
"rewards/rejected": -9.14816951751709,
"step": 2910
},
{
"epoch": 0.8314350797266514,
"grad_norm": 18.253070831298828,
"learning_rate": 1.4021845102505696e-05,
"logits/chosen": 1.4090951681137085,
"logits/rejected": 1.3580656051635742,
"logps/chosen": -260.5758056640625,
"logps/rejected": -275.6468505859375,
"loss": 0.6057,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -7.077749729156494,
"rewards/margins": 1.6264019012451172,
"rewards/rejected": -8.704151153564453,
"step": 2920
},
{
"epoch": 0.8342824601366743,
"grad_norm": 29.435651779174805,
"learning_rate": 1.4021577448747153e-05,
"logits/chosen": 1.1056894063949585,
"logits/rejected": 1.0585362911224365,
"logps/chosen": -262.055419921875,
"logps/rejected": -278.28082275390625,
"loss": 0.4375,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -7.14243221282959,
"rewards/margins": 1.8324248790740967,
"rewards/rejected": -8.974858283996582,
"step": 2930
},
{
"epoch": 0.837129840546697,
"grad_norm": 23.39213752746582,
"learning_rate": 1.402130979498861e-05,
"logits/chosen": 1.1425232887268066,
"logits/rejected": 1.1206319332122803,
"logps/chosen": -254.543212890625,
"logps/rejected": -272.4224548339844,
"loss": 0.5169,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -6.688830375671387,
"rewards/margins": 1.7970342636108398,
"rewards/rejected": -8.485864639282227,
"step": 2940
},
{
"epoch": 0.8399772209567198,
"grad_norm": 7.043151378631592,
"learning_rate": 1.4021042141230069e-05,
"logits/chosen": 1.5714797973632812,
"logits/rejected": 1.5142377614974976,
"logps/chosen": -242.92837524414062,
"logps/rejected": -277.87359619140625,
"loss": 0.4165,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -6.007413864135742,
"rewards/margins": 2.4879133701324463,
"rewards/rejected": -8.495327949523926,
"step": 2950
},
{
"epoch": 0.8428246013667426,
"grad_norm": 12.184271812438965,
"learning_rate": 1.4020774487471527e-05,
"logits/chosen": 1.125959038734436,
"logits/rejected": 1.0558584928512573,
"logps/chosen": -243.2732696533203,
"logps/rejected": -271.07122802734375,
"loss": 0.2672,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.250051021575928,
"rewards/margins": 2.6489367485046387,
"rewards/rejected": -7.898987770080566,
"step": 2960
},
{
"epoch": 0.8456719817767654,
"grad_norm": 3.8234286308288574,
"learning_rate": 1.4020506833712985e-05,
"logits/chosen": 1.368369460105896,
"logits/rejected": 1.295506477355957,
"logps/chosen": -246.85592651367188,
"logps/rejected": -274.9765319824219,
"loss": 0.2654,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -5.611214637756348,
"rewards/margins": 2.9931886196136475,
"rewards/rejected": -8.604402542114258,
"step": 2970
},
{
"epoch": 0.8485193621867881,
"grad_norm": 28.678070068359375,
"learning_rate": 1.4020239179954443e-05,
"logits/chosen": 1.4959561824798584,
"logits/rejected": 1.4261696338653564,
"logps/chosen": -254.52072143554688,
"logps/rejected": -277.2051696777344,
"loss": 0.3827,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -6.094107627868652,
"rewards/margins": 2.3713717460632324,
"rewards/rejected": -8.465478897094727,
"step": 2980
},
{
"epoch": 0.8513667425968109,
"grad_norm": 30.86003303527832,
"learning_rate": 1.4019971526195902e-05,
"logits/chosen": 1.6645902395248413,
"logits/rejected": 1.5708162784576416,
"logps/chosen": -251.2325897216797,
"logps/rejected": -284.61077880859375,
"loss": 0.3969,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.4030866622924805,
"rewards/margins": 3.128662586212158,
"rewards/rejected": -9.53174877166748,
"step": 2990
},
{
"epoch": 0.8542141230068337,
"grad_norm": 24.597213745117188,
"learning_rate": 1.4019703872437358e-05,
"logits/chosen": 1.1054723262786865,
"logits/rejected": 1.0511767864227295,
"logps/chosen": -255.37075805664062,
"logps/rejected": -279.79388427734375,
"loss": 0.5802,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.036796569824219,
"rewards/margins": 2.4329230785369873,
"rewards/rejected": -8.469719886779785,
"step": 3000
},
{
"epoch": 0.8570615034168565,
"grad_norm": 7.576244354248047,
"learning_rate": 1.4019436218678816e-05,
"logits/chosen": 0.9394910931587219,
"logits/rejected": 0.9130109548568726,
"logps/chosen": -260.2972106933594,
"logps/rejected": -278.38916015625,
"loss": 0.4125,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -7.257266998291016,
"rewards/margins": 2.3638997077941895,
"rewards/rejected": -9.621164321899414,
"step": 3010
},
{
"epoch": 0.8599088838268792,
"grad_norm": 19.788061141967773,
"learning_rate": 1.4019168564920273e-05,
"logits/chosen": 0.8776735067367554,
"logits/rejected": 0.8553932905197144,
"logps/chosen": -258.1982421875,
"logps/rejected": -288.4192810058594,
"loss": 0.2575,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -6.4132399559021,
"rewards/margins": 2.6626341342926025,
"rewards/rejected": -9.075874328613281,
"step": 3020
},
{
"epoch": 0.8627562642369021,
"grad_norm": 7.41077995300293,
"learning_rate": 1.4018900911161731e-05,
"logits/chosen": 0.8213122487068176,
"logits/rejected": 0.7751299142837524,
"logps/chosen": -255.14517211914062,
"logps/rejected": -280.5233459472656,
"loss": 0.3752,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -6.201568603515625,
"rewards/margins": 2.61332368850708,
"rewards/rejected": -8.814892768859863,
"step": 3030
},
{
"epoch": 0.8656036446469249,
"grad_norm": 5.560190200805664,
"learning_rate": 1.4018633257403189e-05,
"logits/chosen": 1.249352216720581,
"logits/rejected": 1.222125768661499,
"logps/chosen": -249.1427764892578,
"logps/rejected": -272.9322814941406,
"loss": 0.492,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -5.985461711883545,
"rewards/margins": 2.357642650604248,
"rewards/rejected": -8.343104362487793,
"step": 3040
},
{
"epoch": 0.8684510250569476,
"grad_norm": 17.846477508544922,
"learning_rate": 1.4018365603644647e-05,
"logits/chosen": 1.1001662015914917,
"logits/rejected": 1.073899745941162,
"logps/chosen": -235.15127563476562,
"logps/rejected": -256.9810485839844,
"loss": 0.5462,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.736093521118164,
"rewards/margins": 1.6902631521224976,
"rewards/rejected": -6.426356315612793,
"step": 3050
},
{
"epoch": 0.8712984054669703,
"grad_norm": 6.344593048095703,
"learning_rate": 1.4018097949886105e-05,
"logits/chosen": 1.6326649188995361,
"logits/rejected": 1.630692481994629,
"logps/chosen": -231.1848602294922,
"logps/rejected": -251.69509887695312,
"loss": 0.4002,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -4.022892951965332,
"rewards/margins": 1.9389598369598389,
"rewards/rejected": -5.961852073669434,
"step": 3060
},
{
"epoch": 0.8741457858769932,
"grad_norm": 25.729211807250977,
"learning_rate": 1.4017830296127562e-05,
"logits/chosen": 1.8420072793960571,
"logits/rejected": 1.8046996593475342,
"logps/chosen": -239.9873504638672,
"logps/rejected": -251.55722045898438,
"loss": 0.6297,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -4.679580211639404,
"rewards/margins": 1.3005344867706299,
"rewards/rejected": -5.980114936828613,
"step": 3070
},
{
"epoch": 0.876993166287016,
"grad_norm": 17.203083038330078,
"learning_rate": 1.401756264236902e-05,
"logits/chosen": 1.9162012338638306,
"logits/rejected": 1.8495725393295288,
"logps/chosen": -236.2726593017578,
"logps/rejected": -247.9042510986328,
"loss": 0.5307,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -4.582438945770264,
"rewards/margins": 1.3650325536727905,
"rewards/rejected": -5.947472095489502,
"step": 3080
},
{
"epoch": 0.8798405466970387,
"grad_norm": 14.923961639404297,
"learning_rate": 1.4017294988610478e-05,
"logits/chosen": 1.6813533306121826,
"logits/rejected": 1.647003412246704,
"logps/chosen": -234.11679077148438,
"logps/rejected": -252.88388061523438,
"loss": 0.4419,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -4.13940954208374,
"rewards/margins": 1.7298119068145752,
"rewards/rejected": -5.8692216873168945,
"step": 3090
},
{
"epoch": 0.8826879271070615,
"grad_norm": 11.808030128479004,
"learning_rate": 1.4017027334851936e-05,
"logits/chosen": 1.5109126567840576,
"logits/rejected": 1.4859440326690674,
"logps/chosen": -233.6483612060547,
"logps/rejected": -249.63638305664062,
"loss": 0.5049,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -4.1950249671936035,
"rewards/margins": 1.3217439651489258,
"rewards/rejected": -5.516768455505371,
"step": 3100
},
{
"epoch": 0.8855353075170843,
"grad_norm": 4.284244060516357,
"learning_rate": 1.4016759681093395e-05,
"logits/chosen": 2.059269905090332,
"logits/rejected": 1.9955742359161377,
"logps/chosen": -233.57199096679688,
"logps/rejected": -248.676025390625,
"loss": 0.4276,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -3.8850066661834717,
"rewards/margins": 1.9384835958480835,
"rewards/rejected": -5.823491096496582,
"step": 3110
},
{
"epoch": 0.8883826879271071,
"grad_norm": 32.62685012817383,
"learning_rate": 1.4016492027334853e-05,
"logits/chosen": 1.6952826976776123,
"logits/rejected": 1.7041078805923462,
"logps/chosen": -234.2545928955078,
"logps/rejected": -243.2709197998047,
"loss": 0.5903,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -4.209284782409668,
"rewards/margins": 1.3029229640960693,
"rewards/rejected": -5.512207984924316,
"step": 3120
},
{
"epoch": 0.8912300683371298,
"grad_norm": 17.5522518157959,
"learning_rate": 1.4016224373576311e-05,
"logits/chosen": 1.4055092334747314,
"logits/rejected": 1.3894739151000977,
"logps/chosen": -235.7975311279297,
"logps/rejected": -243.79611206054688,
"loss": 0.4752,
"rewards/accuracies": 0.7833333015441895,
"rewards/chosen": -4.342627048492432,
"rewards/margins": 1.4617254734039307,
"rewards/rejected": -5.804352283477783,
"step": 3130
},
{
"epoch": 0.8940774487471527,
"grad_norm": 11.302553176879883,
"learning_rate": 1.4015956719817767e-05,
"logits/chosen": 1.6838788986206055,
"logits/rejected": 1.6712299585342407,
"logps/chosen": -235.022705078125,
"logps/rejected": -242.8223114013672,
"loss": 0.4063,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -4.4199113845825195,
"rewards/margins": 1.4390454292297363,
"rewards/rejected": -5.858956813812256,
"step": 3140
},
{
"epoch": 0.8969248291571754,
"grad_norm": 11.453242301940918,
"learning_rate": 1.4015689066059226e-05,
"logits/chosen": 1.2508509159088135,
"logits/rejected": 1.2264854907989502,
"logps/chosen": -233.52792358398438,
"logps/rejected": -245.04104614257812,
"loss": 0.4138,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -4.280221462249756,
"rewards/margins": 1.6082589626312256,
"rewards/rejected": -5.888480186462402,
"step": 3150
},
{
"epoch": 0.8997722095671982,
"grad_norm": 5.725339412689209,
"learning_rate": 1.4015421412300684e-05,
"logits/chosen": 1.3065173625946045,
"logits/rejected": 1.2575079202651978,
"logps/chosen": -235.4761505126953,
"logps/rejected": -260.0916442871094,
"loss": 0.3539,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -4.3087477684021,
"rewards/margins": 2.0462512969970703,
"rewards/rejected": -6.35499906539917,
"step": 3160
},
{
"epoch": 0.9026195899772209,
"grad_norm": 14.741144180297852,
"learning_rate": 1.4015153758542142e-05,
"logits/chosen": 1.2061371803283691,
"logits/rejected": 1.1723283529281616,
"logps/chosen": -246.95114135742188,
"logps/rejected": -266.38836669921875,
"loss": 0.4665,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -5.428233623504639,
"rewards/margins": 1.7254750728607178,
"rewards/rejected": -7.153708457946777,
"step": 3170
},
{
"epoch": 0.9054669703872438,
"grad_norm": 22.822391510009766,
"learning_rate": 1.40148861047836e-05,
"logits/chosen": 0.7415103912353516,
"logits/rejected": 0.7030202746391296,
"logps/chosen": -256.55096435546875,
"logps/rejected": -277.3672180175781,
"loss": 0.3735,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.38254976272583,
"rewards/margins": 2.4899661540985107,
"rewards/rejected": -8.872515678405762,
"step": 3180
},
{
"epoch": 0.9083143507972665,
"grad_norm": 23.086688995361328,
"learning_rate": 1.4014618451025058e-05,
"logits/chosen": 1.1446452140808105,
"logits/rejected": 1.0936695337295532,
"logps/chosen": -264.65997314453125,
"logps/rejected": -285.80548095703125,
"loss": 0.3886,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -7.392777919769287,
"rewards/margins": 2.2488951683044434,
"rewards/rejected": -9.64167308807373,
"step": 3190
},
{
"epoch": 0.9111617312072893,
"grad_norm": 12.26983642578125,
"learning_rate": 1.4014350797266517e-05,
"logits/chosen": 0.9253866076469421,
"logits/rejected": 0.867773175239563,
"logps/chosen": -269.03515625,
"logps/rejected": -294.5494079589844,
"loss": 0.3786,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.011452674865723,
"rewards/margins": 2.745143175125122,
"rewards/rejected": -10.756595611572266,
"step": 3200
},
{
"epoch": 0.914009111617312,
"grad_norm": 29.206724166870117,
"learning_rate": 1.4014083143507973e-05,
"logits/chosen": 1.3603354692459106,
"logits/rejected": 1.3081294298171997,
"logps/chosen": -271.06903076171875,
"logps/rejected": -285.3981018066406,
"loss": 0.5639,
"rewards/accuracies": 0.75,
"rewards/chosen": -7.847748756408691,
"rewards/margins": 1.758512258529663,
"rewards/rejected": -9.606261253356934,
"step": 3210
},
{
"epoch": 0.9168564920273349,
"grad_norm": 18.595735549926758,
"learning_rate": 1.4013815489749431e-05,
"logits/chosen": 1.1783987283706665,
"logits/rejected": 1.1358586549758911,
"logps/chosen": -274.8568420410156,
"logps/rejected": -301.0245056152344,
"loss": 0.3253,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -8.224080085754395,
"rewards/margins": 2.375084638595581,
"rewards/rejected": -10.599164962768555,
"step": 3220
},
{
"epoch": 0.9197038724373576,
"grad_norm": 2.1214218139648438,
"learning_rate": 1.401354783599089e-05,
"logits/chosen": 0.9433174133300781,
"logits/rejected": 0.8605527877807617,
"logps/chosen": -273.72625732421875,
"logps/rejected": -299.55023193359375,
"loss": 0.4735,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -8.827226638793945,
"rewards/margins": 2.468843460083008,
"rewards/rejected": -11.296069145202637,
"step": 3230
},
{
"epoch": 0.9225512528473804,
"grad_norm": 21.682979583740234,
"learning_rate": 1.4013280182232348e-05,
"logits/chosen": 1.137378215789795,
"logits/rejected": 1.0911321640014648,
"logps/chosen": -276.5653076171875,
"logps/rejected": -296.9649963378906,
"loss": 0.5877,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -8.806356430053711,
"rewards/margins": 1.9599056243896484,
"rewards/rejected": -10.766261100769043,
"step": 3240
},
{
"epoch": 0.9253986332574032,
"grad_norm": 16.58523941040039,
"learning_rate": 1.4013012528473804e-05,
"logits/chosen": 1.2076373100280762,
"logits/rejected": 1.1502797603607178,
"logps/chosen": -262.76336669921875,
"logps/rejected": -297.0911560058594,
"loss": 0.3606,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -7.569868564605713,
"rewards/margins": 2.8257970809936523,
"rewards/rejected": -10.39566421508789,
"step": 3250
},
{
"epoch": 0.928246013667426,
"grad_norm": 32.41596984863281,
"learning_rate": 1.4012744874715262e-05,
"logits/chosen": 1.0952074527740479,
"logits/rejected": 1.0772335529327393,
"logps/chosen": -271.6856994628906,
"logps/rejected": -288.0881042480469,
"loss": 0.5652,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -7.678606986999512,
"rewards/margins": 1.8118677139282227,
"rewards/rejected": -9.49047565460205,
"step": 3260
},
{
"epoch": 0.9310933940774487,
"grad_norm": 16.00614356994629,
"learning_rate": 1.401247722095672e-05,
"logits/chosen": 1.1436659097671509,
"logits/rejected": 1.1272941827774048,
"logps/chosen": -257.2648010253906,
"logps/rejected": -267.32452392578125,
"loss": 0.567,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -6.478804588317871,
"rewards/margins": 1.3715341091156006,
"rewards/rejected": -7.850338935852051,
"step": 3270
},
{
"epoch": 0.9339407744874715,
"grad_norm": 12.072650909423828,
"learning_rate": 1.4012209567198177e-05,
"logits/chosen": 0.7003241777420044,
"logits/rejected": 0.671663761138916,
"logps/chosen": -251.978515625,
"logps/rejected": -272.39129638671875,
"loss": 0.3737,
"rewards/accuracies": 0.8166667819023132,
"rewards/chosen": -6.3597917556762695,
"rewards/margins": 2.238091230392456,
"rewards/rejected": -8.597883224487305,
"step": 3280
},
{
"epoch": 0.9367881548974943,
"grad_norm": 17.540746688842773,
"learning_rate": 1.4011941913439635e-05,
"logits/chosen": 1.0943725109100342,
"logits/rejected": 1.0946639776229858,
"logps/chosen": -257.8005676269531,
"logps/rejected": -275.43048095703125,
"loss": 0.7236,
"rewards/accuracies": 0.6666666269302368,
"rewards/chosen": -6.777687072753906,
"rewards/margins": 1.4592044353485107,
"rewards/rejected": -8.236891746520996,
"step": 3290
},
{
"epoch": 0.9396355353075171,
"grad_norm": 20.523656845092773,
"learning_rate": 1.4011674259681093e-05,
"logits/chosen": 0.46389874815940857,
"logits/rejected": 0.4536392092704773,
"logps/chosen": -268.06927490234375,
"logps/rejected": -283.0080871582031,
"loss": 0.5618,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -7.472816467285156,
"rewards/margins": 1.5584744215011597,
"rewards/rejected": -9.031290054321289,
"step": 3300
},
{
"epoch": 0.9424829157175398,
"grad_norm": 13.506325721740723,
"learning_rate": 1.4011406605922551e-05,
"logits/chosen": 0.7252389192581177,
"logits/rejected": 0.7051321864128113,
"logps/chosen": -266.4188537597656,
"logps/rejected": -288.6436767578125,
"loss": 0.3219,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -7.288155555725098,
"rewards/margins": 2.1877682209014893,
"rewards/rejected": -9.475923538208008,
"step": 3310
},
{
"epoch": 0.9453302961275627,
"grad_norm": 14.530557632446289,
"learning_rate": 1.401113895216401e-05,
"logits/chosen": 0.7469380497932434,
"logits/rejected": 0.7100605964660645,
"logps/chosen": -264.8739318847656,
"logps/rejected": -279.9246520996094,
"loss": 0.3682,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -7.130241394042969,
"rewards/margins": 1.9568824768066406,
"rewards/rejected": -9.08712387084961,
"step": 3320
},
{
"epoch": 0.9481776765375854,
"grad_norm": 3.6131389141082764,
"learning_rate": 1.4010871298405468e-05,
"logits/chosen": 0.8936527967453003,
"logits/rejected": 0.8348100781440735,
"logps/chosen": -261.61505126953125,
"logps/rejected": -276.36517333984375,
"loss": 0.4312,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -6.888121128082275,
"rewards/margins": 1.6904165744781494,
"rewards/rejected": -8.578537940979004,
"step": 3330
},
{
"epoch": 0.9510250569476082,
"grad_norm": 14.113249778747559,
"learning_rate": 1.4010603644646926e-05,
"logits/chosen": 0.9257510900497437,
"logits/rejected": 0.8715246319770813,
"logps/chosen": -259.8716735839844,
"logps/rejected": -280.5459899902344,
"loss": 0.446,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -6.530680179595947,
"rewards/margins": 2.3227107524871826,
"rewards/rejected": -8.85339069366455,
"step": 3340
},
{
"epoch": 0.9538724373576309,
"grad_norm": 23.52164649963379,
"learning_rate": 1.4010335990888382e-05,
"logits/chosen": 0.7195979356765747,
"logits/rejected": 0.6487305164337158,
"logps/chosen": -269.1336975097656,
"logps/rejected": -293.1927185058594,
"loss": 0.3455,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -7.874746799468994,
"rewards/margins": 2.290498971939087,
"rewards/rejected": -10.165246963500977,
"step": 3350
},
{
"epoch": 0.9567198177676538,
"grad_norm": 21.7740478515625,
"learning_rate": 1.401006833712984e-05,
"logits/chosen": 0.9675396084785461,
"logits/rejected": 0.9502711296081543,
"logps/chosen": -280.28265380859375,
"logps/rejected": -295.63922119140625,
"loss": 0.5749,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -8.650227546691895,
"rewards/margins": 1.562798261642456,
"rewards/rejected": -10.21302604675293,
"step": 3360
},
{
"epoch": 0.9595671981776766,
"grad_norm": 7.417437553405762,
"learning_rate": 1.4009800683371299e-05,
"logits/chosen": 0.8215571641921997,
"logits/rejected": 0.8030030131340027,
"logps/chosen": -280.65167236328125,
"logps/rejected": -294.6379699707031,
"loss": 0.5076,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -8.616076469421387,
"rewards/margins": 1.911773443222046,
"rewards/rejected": -10.527849197387695,
"step": 3370
},
{
"epoch": 0.9624145785876993,
"grad_norm": 6.399313449859619,
"learning_rate": 1.4009533029612757e-05,
"logits/chosen": 0.8853170275688171,
"logits/rejected": 0.8363176584243774,
"logps/chosen": -271.77557373046875,
"logps/rejected": -293.91229248046875,
"loss": 0.2771,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -8.413153648376465,
"rewards/margins": 2.4288392066955566,
"rewards/rejected": -10.84199333190918,
"step": 3380
},
{
"epoch": 0.965261958997722,
"grad_norm": 21.55265998840332,
"learning_rate": 1.4009265375854215e-05,
"logits/chosen": 0.755032479763031,
"logits/rejected": 0.6583060622215271,
"logps/chosen": -271.9349365234375,
"logps/rejected": -301.05926513671875,
"loss": 0.4095,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.547085762023926,
"rewards/margins": 2.608051300048828,
"rewards/rejected": -11.155137062072754,
"step": 3390
},
{
"epoch": 0.9681093394077449,
"grad_norm": 22.12690544128418,
"learning_rate": 1.4008997722095673e-05,
"logits/chosen": 0.648233950138092,
"logits/rejected": 0.6009319424629211,
"logps/chosen": -268.60980224609375,
"logps/rejected": -287.47772216796875,
"loss": 0.4599,
"rewards/accuracies": 0.8166666030883789,
"rewards/chosen": -7.8952531814575195,
"rewards/margins": 2.053194522857666,
"rewards/rejected": -9.948448181152344,
"step": 3400
},
{
"epoch": 0.9709567198177677,
"grad_norm": 5.293265342712402,
"learning_rate": 1.4008730068337131e-05,
"logits/chosen": 0.7652915120124817,
"logits/rejected": 0.7024304866790771,
"logps/chosen": -272.99755859375,
"logps/rejected": -295.06463623046875,
"loss": 0.4947,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -7.992232322692871,
"rewards/margins": 2.4706037044525146,
"rewards/rejected": -10.462837219238281,
"step": 3410
},
{
"epoch": 0.9738041002277904,
"grad_norm": 19.21979331970215,
"learning_rate": 1.4008462414578588e-05,
"logits/chosen": 0.7430532574653625,
"logits/rejected": 0.6697141528129578,
"logps/chosen": -268.928955078125,
"logps/rejected": -291.8720703125,
"loss": 0.316,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -8.033040046691895,
"rewards/margins": 2.5782742500305176,
"rewards/rejected": -10.61131477355957,
"step": 3420
},
{
"epoch": 0.9766514806378133,
"grad_norm": 13.320822715759277,
"learning_rate": 1.4008194760820046e-05,
"logits/chosen": 0.8317564129829407,
"logits/rejected": 0.7908271551132202,
"logps/chosen": -268.9102783203125,
"logps/rejected": -295.58349609375,
"loss": 0.4192,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -7.700282096862793,
"rewards/margins": 2.5946054458618164,
"rewards/rejected": -10.294888496398926,
"step": 3430
},
{
"epoch": 0.979498861047836,
"grad_norm": 9.567367553710938,
"learning_rate": 1.4007927107061504e-05,
"logits/chosen": 0.8482456207275391,
"logits/rejected": 0.8254700899124146,
"logps/chosen": -258.6345520019531,
"logps/rejected": -284.7102355957031,
"loss": 0.3652,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -6.666827201843262,
"rewards/margins": 2.667280673980713,
"rewards/rejected": -9.334108352661133,
"step": 3440
},
{
"epoch": 0.9823462414578588,
"grad_norm": 10.789649963378906,
"learning_rate": 1.4007659453302962e-05,
"logits/chosen": 0.5343358516693115,
"logits/rejected": 0.5035391449928284,
"logps/chosen": -258.1282043457031,
"logps/rejected": -283.01202392578125,
"loss": 0.2902,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -6.793849945068359,
"rewards/margins": 2.455238103866577,
"rewards/rejected": -9.249088287353516,
"step": 3450
},
{
"epoch": 0.9851936218678815,
"grad_norm": 14.10362720489502,
"learning_rate": 1.400739179954442e-05,
"logits/chosen": 0.3702552020549774,
"logits/rejected": 0.32574790716171265,
"logps/chosen": -262.18408203125,
"logps/rejected": -284.9990234375,
"loss": 0.457,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -7.336909294128418,
"rewards/margins": 2.0833096504211426,
"rewards/rejected": -9.420219421386719,
"step": 3460
},
{
"epoch": 0.9880410022779044,
"grad_norm": 5.914283275604248,
"learning_rate": 1.4007124145785877e-05,
"logits/chosen": 0.540917158126831,
"logits/rejected": 0.5174443125724792,
"logps/chosen": -251.36801147460938,
"logps/rejected": -273.43450927734375,
"loss": 0.2641,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -6.19476318359375,
"rewards/margins": 2.6106886863708496,
"rewards/rejected": -8.805452346801758,
"step": 3470
},
{
"epoch": 0.9908883826879271,
"grad_norm": 7.198950290679932,
"learning_rate": 1.4006856492027335e-05,
"logits/chosen": 0.8166311383247375,
"logits/rejected": 0.7666522264480591,
"logps/chosen": -246.75436401367188,
"logps/rejected": -270.91619873046875,
"loss": 0.2759,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.385769844055176,
"rewards/margins": 2.8532614707946777,
"rewards/rejected": -8.239030838012695,
"step": 3480
},
{
"epoch": 0.9937357630979499,
"grad_norm": 16.072847366333008,
"learning_rate": 1.4006588838268792e-05,
"logits/chosen": 0.6863161325454712,
"logits/rejected": 0.6740552186965942,
"logps/chosen": -248.19387817382812,
"logps/rejected": -265.17889404296875,
"loss": 0.5973,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -5.686077117919922,
"rewards/margins": 1.7341415882110596,
"rewards/rejected": -7.420218467712402,
"step": 3490
},
{
"epoch": 0.9965831435079726,
"grad_norm": 13.368264198303223,
"learning_rate": 1.400632118451025e-05,
"logits/chosen": 0.6482622623443604,
"logits/rejected": 0.5943307876586914,
"logps/chosen": -249.43032836914062,
"logps/rejected": -268.16607666015625,
"loss": 0.4939,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -5.8397626876831055,
"rewards/margins": 2.000570058822632,
"rewards/rejected": -7.840332984924316,
"step": 3500
},
{
"epoch": 0.9994305239179955,
"grad_norm": 16.661462783813477,
"learning_rate": 1.4006053530751708e-05,
"logits/chosen": 0.16263927519321442,
"logits/rejected": 0.1554526388645172,
"logps/chosen": -251.5836639404297,
"logps/rejected": -271.8070068359375,
"loss": 0.3875,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -6.0699052810668945,
"rewards/margins": 1.8712621927261353,
"rewards/rejected": -7.94116735458374,
"step": 3510
},
{
"epoch": 1.0022779043280183,
"grad_norm": 10.85046100616455,
"learning_rate": 1.4005785876993166e-05,
"logits/chosen": 0.3554074764251709,
"logits/rejected": 0.34798485040664673,
"logps/chosen": -253.5457000732422,
"logps/rejected": -276.7463684082031,
"loss": 0.394,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -6.179994583129883,
"rewards/margins": 2.402238130569458,
"rewards/rejected": -8.582232475280762,
"step": 3520
},
{
"epoch": 1.005125284738041,
"grad_norm": 21.960153579711914,
"learning_rate": 1.4005518223234624e-05,
"logits/chosen": 0.39422959089279175,
"logits/rejected": 0.35846349596977234,
"logps/chosen": -263.34832763671875,
"logps/rejected": -285.46929931640625,
"loss": 0.4013,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -7.29735803604126,
"rewards/margins": 2.383136510848999,
"rewards/rejected": -9.680493354797363,
"step": 3530
},
{
"epoch": 1.0079726651480638,
"grad_norm": 6.245369911193848,
"learning_rate": 1.4005250569476083e-05,
"logits/chosen": 0.5632292628288269,
"logits/rejected": 0.502747654914856,
"logps/chosen": -268.88616943359375,
"logps/rejected": -290.48797607421875,
"loss": 0.3698,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -7.674142360687256,
"rewards/margins": 2.4379868507385254,
"rewards/rejected": -10.112129211425781,
"step": 3540
},
{
"epoch": 1.0108200455580865,
"grad_norm": 11.277658462524414,
"learning_rate": 1.400498291571754e-05,
"logits/chosen": 0.5037197470664978,
"logits/rejected": 0.47341617941856384,
"logps/chosen": -269.2908630371094,
"logps/rejected": -290.01025390625,
"loss": 0.5014,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -7.865833282470703,
"rewards/margins": 2.0145423412323,
"rewards/rejected": -9.880374908447266,
"step": 3550
},
{
"epoch": 1.0136674259681093,
"grad_norm": 17.50825309753418,
"learning_rate": 1.4004715261958999e-05,
"logits/chosen": 0.2622000575065613,
"logits/rejected": 0.22665706276893616,
"logps/chosen": -268.0179443359375,
"logps/rejected": -284.2073059082031,
"loss": 0.3836,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -7.505197048187256,
"rewards/margins": 2.5877254009246826,
"rewards/rejected": -10.092923164367676,
"step": 3560
},
{
"epoch": 1.0165148063781322,
"grad_norm": 11.33314323425293,
"learning_rate": 1.4004447608200455e-05,
"logits/chosen": 0.3246827721595764,
"logits/rejected": 0.3127135634422302,
"logps/chosen": -256.9268798828125,
"logps/rejected": -285.8166809082031,
"loss": 0.2469,
"rewards/accuracies": 0.9333332777023315,
"rewards/chosen": -6.845952033996582,
"rewards/margins": 2.841376543045044,
"rewards/rejected": -9.687329292297363,
"step": 3570
},
{
"epoch": 1.0193621867881548,
"grad_norm": 15.446540832519531,
"learning_rate": 1.4004179954441914e-05,
"logits/chosen": 0.36501818895339966,
"logits/rejected": 0.31509679555892944,
"logps/chosen": -260.56707763671875,
"logps/rejected": -287.8186950683594,
"loss": 0.2903,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -6.984988212585449,
"rewards/margins": 2.249208927154541,
"rewards/rejected": -9.234196662902832,
"step": 3580
},
{
"epoch": 1.0222095671981777,
"grad_norm": 13.437565803527832,
"learning_rate": 1.4003912300683372e-05,
"logits/chosen": 0.45073944330215454,
"logits/rejected": 0.402204692363739,
"logps/chosen": -257.2052307128906,
"logps/rejected": -281.25775146484375,
"loss": 0.4182,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -6.852511405944824,
"rewards/margins": 2.6151347160339355,
"rewards/rejected": -9.467647552490234,
"step": 3590
},
{
"epoch": 1.0250569476082005,
"grad_norm": 8.935168266296387,
"learning_rate": 1.400364464692483e-05,
"logits/chosen": 0.6178187131881714,
"logits/rejected": 0.5572710633277893,
"logps/chosen": -266.0174560546875,
"logps/rejected": -285.91973876953125,
"loss": 0.3928,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -7.804986000061035,
"rewards/margins": 2.040278196334839,
"rewards/rejected": -9.845263481140137,
"step": 3600
},
{
"epoch": 1.0279043280182232,
"grad_norm": 15.844207763671875,
"learning_rate": 1.4003376993166288e-05,
"logits/chosen": 0.5763198137283325,
"logits/rejected": 0.530194103717804,
"logps/chosen": -275.58245849609375,
"logps/rejected": -299.5401916503906,
"loss": 0.4687,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -8.525606155395508,
"rewards/margins": 2.2367708683013916,
"rewards/rejected": -10.76237678527832,
"step": 3610
},
{
"epoch": 1.030751708428246,
"grad_norm": 15.936345100402832,
"learning_rate": 1.4003109339407746e-05,
"logits/chosen": 0.6139250993728638,
"logits/rejected": 0.5907430648803711,
"logps/chosen": -277.6994323730469,
"logps/rejected": -305.0992126464844,
"loss": 0.3596,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -8.412062644958496,
"rewards/margins": 3.1548120975494385,
"rewards/rejected": -11.566877365112305,
"step": 3620
},
{
"epoch": 1.033599088838269,
"grad_norm": 19.53025245666504,
"learning_rate": 1.4002841685649205e-05,
"logits/chosen": 0.5422394871711731,
"logits/rejected": 0.5309593081474304,
"logps/chosen": -286.34417724609375,
"logps/rejected": -308.42584228515625,
"loss": 0.4412,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -9.668391227722168,
"rewards/margins": 2.4024975299835205,
"rewards/rejected": -12.07088851928711,
"step": 3630
},
{
"epoch": 1.0364464692482915,
"grad_norm": 16.09668731689453,
"learning_rate": 1.4002574031890661e-05,
"logits/chosen": 0.7884154915809631,
"logits/rejected": 0.7050036787986755,
"logps/chosen": -279.5062561035156,
"logps/rejected": -309.83856201171875,
"loss": 0.2377,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -8.75977897644043,
"rewards/margins": 3.1992735862731934,
"rewards/rejected": -11.959052085876465,
"step": 3640
},
{
"epoch": 1.0392938496583144,
"grad_norm": 18.160032272338867,
"learning_rate": 1.400230637813212e-05,
"logits/chosen": 0.794320285320282,
"logits/rejected": 0.7702603340148926,
"logps/chosen": -282.390869140625,
"logps/rejected": -307.16943359375,
"loss": 0.6078,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -8.865792274475098,
"rewards/margins": 2.653172016143799,
"rewards/rejected": -11.518964767456055,
"step": 3650
},
{
"epoch": 1.042141230068337,
"grad_norm": 17.57752227783203,
"learning_rate": 1.4002038724373577e-05,
"logits/chosen": 0.8936277627944946,
"logits/rejected": 0.8679503202438354,
"logps/chosen": -264.1590881347656,
"logps/rejected": -295.1584167480469,
"loss": 0.3022,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -7.652402400970459,
"rewards/margins": 2.8865489959716797,
"rewards/rejected": -10.53895092010498,
"step": 3660
},
{
"epoch": 1.04498861047836,
"grad_norm": 10.988320350646973,
"learning_rate": 1.4001771070615036e-05,
"logits/chosen": 0.6470170021057129,
"logits/rejected": 0.5980864763259888,
"logps/chosen": -278.0357971191406,
"logps/rejected": -301.3656005859375,
"loss": 0.2475,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -8.397979736328125,
"rewards/margins": 2.759563446044922,
"rewards/rejected": -11.157544136047363,
"step": 3670
},
{
"epoch": 1.0478359908883828,
"grad_norm": 21.041053771972656,
"learning_rate": 1.4001503416856494e-05,
"logits/chosen": 0.8440157771110535,
"logits/rejected": 0.8416398167610168,
"logps/chosen": -280.1607666015625,
"logps/rejected": -308.46673583984375,
"loss": 0.4413,
"rewards/accuracies": 0.8166666030883789,
"rewards/chosen": -9.083349227905273,
"rewards/margins": 2.9152331352233887,
"rewards/rejected": -11.998581886291504,
"step": 3680
},
{
"epoch": 1.0506833712984054,
"grad_norm": 34.18033981323242,
"learning_rate": 1.400123576309795e-05,
"logits/chosen": 1.1155614852905273,
"logits/rejected": 1.037345051765442,
"logps/chosen": -291.4377136230469,
"logps/rejected": -315.4261169433594,
"loss": 0.2893,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -9.966141700744629,
"rewards/margins": 2.7352890968322754,
"rewards/rejected": -12.701430320739746,
"step": 3690
},
{
"epoch": 1.0535307517084282,
"grad_norm": 22.951887130737305,
"learning_rate": 1.4000968109339408e-05,
"logits/chosen": 0.8897800445556641,
"logits/rejected": 0.8015605211257935,
"logps/chosen": -282.42718505859375,
"logps/rejected": -311.1539001464844,
"loss": 0.3469,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -9.409584045410156,
"rewards/margins": 2.546072483062744,
"rewards/rejected": -11.955656051635742,
"step": 3700
},
{
"epoch": 1.056378132118451,
"grad_norm": 26.796382904052734,
"learning_rate": 1.4000700455580865e-05,
"logits/chosen": 0.6857692003250122,
"logits/rejected": 0.6226261258125305,
"logps/chosen": -282.7877197265625,
"logps/rejected": -315.45367431640625,
"loss": 0.294,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -9.390748977661133,
"rewards/margins": 3.2603118419647217,
"rewards/rejected": -12.6510591506958,
"step": 3710
},
{
"epoch": 1.0592255125284737,
"grad_norm": 33.42181396484375,
"learning_rate": 1.4000432801822323e-05,
"logits/chosen": 0.4234946668148041,
"logits/rejected": 0.3805975317955017,
"logps/chosen": -296.66644287109375,
"logps/rejected": -319.873291015625,
"loss": 0.388,
"rewards/accuracies": 0.8500000834465027,
"rewards/chosen": -10.75451374053955,
"rewards/margins": 2.3954834938049316,
"rewards/rejected": -13.149996757507324,
"step": 3720
},
{
"epoch": 1.0620728929384966,
"grad_norm": 8.873187065124512,
"learning_rate": 1.4000165148063781e-05,
"logits/chosen": 0.8379060626029968,
"logits/rejected": 0.8032379150390625,
"logps/chosen": -280.6669616699219,
"logps/rejected": -309.33251953125,
"loss": 0.3693,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -8.943597793579102,
"rewards/margins": 2.9501333236694336,
"rewards/rejected": -11.893732070922852,
"step": 3730
},
{
"epoch": 1.0649202733485195,
"grad_norm": 17.75446891784668,
"learning_rate": 1.399989749430524e-05,
"logits/chosen": 0.4959283769130707,
"logits/rejected": 0.4724133014678955,
"logps/chosen": -288.1912841796875,
"logps/rejected": -308.560546875,
"loss": 0.4123,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -9.21948528289795,
"rewards/margins": 2.6541736125946045,
"rewards/rejected": -11.873659133911133,
"step": 3740
},
{
"epoch": 1.067767653758542,
"grad_norm": 21.715051651000977,
"learning_rate": 1.3999629840546698e-05,
"logits/chosen": 0.9915273785591125,
"logits/rejected": 0.9026697278022766,
"logps/chosen": -288.552490234375,
"logps/rejected": -325.791015625,
"loss": 0.2591,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -9.927160263061523,
"rewards/margins": 3.521343946456909,
"rewards/rejected": -13.448504447937012,
"step": 3750
},
{
"epoch": 1.070615034168565,
"grad_norm": 8.18878173828125,
"learning_rate": 1.3999362186788156e-05,
"logits/chosen": 0.8200756907463074,
"logits/rejected": 0.7880581617355347,
"logps/chosen": -281.2442932128906,
"logps/rejected": -309.26300048828125,
"loss": 0.3599,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -9.674338340759277,
"rewards/margins": 2.2603495121002197,
"rewards/rejected": -11.934687614440918,
"step": 3760
},
{
"epoch": 1.0734624145785876,
"grad_norm": 11.345477104187012,
"learning_rate": 1.3999094533029614e-05,
"logits/chosen": 0.6193078756332397,
"logits/rejected": 0.5920727849006653,
"logps/chosen": -297.87579345703125,
"logps/rejected": -324.90875244140625,
"loss": 0.3291,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -10.389954566955566,
"rewards/margins": 3.0738964080810547,
"rewards/rejected": -13.463849067687988,
"step": 3770
},
{
"epoch": 1.0763097949886105,
"grad_norm": 12.467089653015137,
"learning_rate": 1.399882687927107e-05,
"logits/chosen": 0.6029736995697021,
"logits/rejected": 0.5278339385986328,
"logps/chosen": -287.04547119140625,
"logps/rejected": -313.1669006347656,
"loss": 0.2722,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -9.467456817626953,
"rewards/margins": 3.074500322341919,
"rewards/rejected": -12.54195785522461,
"step": 3780
},
{
"epoch": 1.0791571753986333,
"grad_norm": 5.360784530639648,
"learning_rate": 1.3998559225512529e-05,
"logits/chosen": 0.5335519313812256,
"logits/rejected": 0.5010834336280823,
"logps/chosen": -273.48895263671875,
"logps/rejected": -307.15576171875,
"loss": 0.3501,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -8.458849906921387,
"rewards/margins": 3.136683940887451,
"rewards/rejected": -11.59553337097168,
"step": 3790
},
{
"epoch": 1.082004555808656,
"grad_norm": 23.285932540893555,
"learning_rate": 1.3998291571753987e-05,
"logits/chosen": 0.4613843858242035,
"logits/rejected": 0.4110085070133209,
"logps/chosen": -275.67706298828125,
"logps/rejected": -303.83782958984375,
"loss": 0.3489,
"rewards/accuracies": 0.8333333730697632,
"rewards/chosen": -8.685543060302734,
"rewards/margins": 2.5913987159729004,
"rewards/rejected": -11.276942253112793,
"step": 3800
},
{
"epoch": 1.0848519362186788,
"grad_norm": 18.05284309387207,
"learning_rate": 1.3998023917995445e-05,
"logits/chosen": 0.7175520658493042,
"logits/rejected": 0.6548722386360168,
"logps/chosen": -279.5047912597656,
"logps/rejected": -309.22479248046875,
"loss": 0.3255,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -9.209101676940918,
"rewards/margins": 3.0299510955810547,
"rewards/rejected": -12.239053726196289,
"step": 3810
},
{
"epoch": 1.0876993166287017,
"grad_norm": 15.506644248962402,
"learning_rate": 1.3997756264236903e-05,
"logits/chosen": 0.8500604629516602,
"logits/rejected": 0.8025990724563599,
"logps/chosen": -281.86053466796875,
"logps/rejected": -311.606201171875,
"loss": 0.3359,
"rewards/accuracies": 0.8500000834465027,
"rewards/chosen": -9.722951889038086,
"rewards/margins": 2.3718514442443848,
"rewards/rejected": -12.094802856445312,
"step": 3820
},
{
"epoch": 1.0905466970387243,
"grad_norm": 3.2531468868255615,
"learning_rate": 1.3997488610478361e-05,
"logits/chosen": 0.9165185689926147,
"logits/rejected": 0.8282724618911743,
"logps/chosen": -278.8956604003906,
"logps/rejected": -312.00750732421875,
"loss": 0.2895,
"rewards/accuracies": 0.8833333849906921,
"rewards/chosen": -8.979422569274902,
"rewards/margins": 3.4525809288024902,
"rewards/rejected": -12.432002067565918,
"step": 3830
},
{
"epoch": 1.0933940774487472,
"grad_norm": 8.112292289733887,
"learning_rate": 1.399722095671982e-05,
"logits/chosen": 1.0575357675552368,
"logits/rejected": 1.013039469718933,
"logps/chosen": -280.9009094238281,
"logps/rejected": -303.0403747558594,
"loss": 0.3678,
"rewards/accuracies": 0.7833333015441895,
"rewards/chosen": -8.741242408752441,
"rewards/margins": 2.594489574432373,
"rewards/rejected": -11.335733413696289,
"step": 3840
},
{
"epoch": 1.09624145785877,
"grad_norm": 3.0322318077087402,
"learning_rate": 1.3996953302961276e-05,
"logits/chosen": 0.8057734370231628,
"logits/rejected": 0.7428101301193237,
"logps/chosen": -293.65765380859375,
"logps/rejected": -322.9544982910156,
"loss": 0.3457,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -10.046621322631836,
"rewards/margins": 2.509535312652588,
"rewards/rejected": -12.556157112121582,
"step": 3850
},
{
"epoch": 1.0990888382687927,
"grad_norm": 13.537039756774902,
"learning_rate": 1.3996685649202734e-05,
"logits/chosen": 0.8179152607917786,
"logits/rejected": 0.7628077268600464,
"logps/chosen": -309.3814392089844,
"logps/rejected": -333.23236083984375,
"loss": 0.3651,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -11.620147705078125,
"rewards/margins": 2.818713426589966,
"rewards/rejected": -14.438860893249512,
"step": 3860
},
{
"epoch": 1.1019362186788155,
"grad_norm": 15.105693817138672,
"learning_rate": 1.3996417995444192e-05,
"logits/chosen": 0.8153516054153442,
"logits/rejected": 0.7349362373352051,
"logps/chosen": -316.74114990234375,
"logps/rejected": -343.94329833984375,
"loss": 0.2984,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -12.85276985168457,
"rewards/margins": 2.6356797218322754,
"rewards/rejected": -15.48845100402832,
"step": 3870
},
{
"epoch": 1.1047835990888384,
"grad_norm": 9.755592346191406,
"learning_rate": 1.399615034168565e-05,
"logits/chosen": 0.6579657793045044,
"logits/rejected": 0.5918071269989014,
"logps/chosen": -309.8885498046875,
"logps/rejected": -337.7580261230469,
"loss": 0.385,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -11.838005065917969,
"rewards/margins": 3.0567147731781006,
"rewards/rejected": -14.894720077514648,
"step": 3880
},
{
"epoch": 1.107630979498861,
"grad_norm": 19.806747436523438,
"learning_rate": 1.3995882687927109e-05,
"logits/chosen": 1.0924065113067627,
"logits/rejected": 1.0375772714614868,
"logps/chosen": -309.6856384277344,
"logps/rejected": -335.0812683105469,
"loss": 0.2217,
"rewards/accuracies": 0.8833333253860474,
"rewards/chosen": -11.95275592803955,
"rewards/margins": 2.838541269302368,
"rewards/rejected": -14.791296005249023,
"step": 3890
},
{
"epoch": 1.1104783599088839,
"grad_norm": 20.042261123657227,
"learning_rate": 1.3995615034168567e-05,
"logits/chosen": 0.5404999256134033,
"logits/rejected": 0.5026549100875854,
"logps/chosen": -308.2242736816406,
"logps/rejected": -331.86944580078125,
"loss": 0.3761,
"rewards/accuracies": 0.8500000834465027,
"rewards/chosen": -12.19024658203125,
"rewards/margins": 2.2361843585968018,
"rewards/rejected": -14.426431655883789,
"step": 3900
},
{
"epoch": 1.1133257403189065,
"grad_norm": 9.1209135055542,
"learning_rate": 1.3995347380410023e-05,
"logits/chosen": 1.2070353031158447,
"logits/rejected": 1.1822826862335205,
"logps/chosen": -312.82049560546875,
"logps/rejected": -336.69305419921875,
"loss": 0.3917,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -11.746156692504883,
"rewards/margins": 3.025268793106079,
"rewards/rejected": -14.77142333984375,
"step": 3910
},
{
"epoch": 1.1161731207289294,
"grad_norm": 5.072232723236084,
"learning_rate": 1.399507972665148e-05,
"logits/chosen": 0.7343096137046814,
"logits/rejected": 0.7266718149185181,
"logps/chosen": -295.480712890625,
"logps/rejected": -331.20391845703125,
"loss": 0.2191,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -10.570296287536621,
"rewards/margins": 3.5358786582946777,
"rewards/rejected": -14.106175422668457,
"step": 3920
},
{
"epoch": 1.1190205011389522,
"grad_norm": 15.483905792236328,
"learning_rate": 1.3994812072892938e-05,
"logits/chosen": 0.7368067502975464,
"logits/rejected": 0.6594582796096802,
"logps/chosen": -284.6335144042969,
"logps/rejected": -315.6561584472656,
"loss": 0.2343,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -10.035016059875488,
"rewards/margins": 3.443873167037964,
"rewards/rejected": -13.478889465332031,
"step": 3930
},
{
"epoch": 1.1218678815489749,
"grad_norm": 20.447673797607422,
"learning_rate": 1.3994544419134396e-05,
"logits/chosen": 0.6332524418830872,
"logits/rejected": 0.606482207775116,
"logps/chosen": -290.54254150390625,
"logps/rejected": -317.78076171875,
"loss": 0.5733,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -10.31615161895752,
"rewards/margins": 2.5428757667541504,
"rewards/rejected": -12.859028816223145,
"step": 3940
},
{
"epoch": 1.1247152619589977,
"grad_norm": 15.281331062316895,
"learning_rate": 1.3994276765375854e-05,
"logits/chosen": 1.1297801733016968,
"logits/rejected": 1.0745735168457031,
"logps/chosen": -300.40338134765625,
"logps/rejected": -326.6527404785156,
"loss": 0.4481,
"rewards/accuracies": 0.8000000715255737,
"rewards/chosen": -11.181158065795898,
"rewards/margins": 2.283782482147217,
"rewards/rejected": -13.464941024780273,
"step": 3950
},
{
"epoch": 1.1275626423690206,
"grad_norm": 17.650279998779297,
"learning_rate": 1.3994009111617312e-05,
"logits/chosen": 0.7293352484703064,
"logits/rejected": 0.7268023490905762,
"logps/chosen": -323.0600280761719,
"logps/rejected": -347.69049072265625,
"loss": 0.5341,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -13.055007934570312,
"rewards/margins": 2.4304747581481934,
"rewards/rejected": -15.485481262207031,
"step": 3960
},
{
"epoch": 1.1304100227790432,
"grad_norm": 13.10045337677002,
"learning_rate": 1.399374145785877e-05,
"logits/chosen": 0.9735004305839539,
"logits/rejected": 0.9148595929145813,
"logps/chosen": -318.93878173828125,
"logps/rejected": -341.3691101074219,
"loss": 0.4774,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -12.5765380859375,
"rewards/margins": 2.1414952278137207,
"rewards/rejected": -14.718032836914062,
"step": 3970
},
{
"epoch": 1.133257403189066,
"grad_norm": 2.9735374450683594,
"learning_rate": 1.3993473804100229e-05,
"logits/chosen": 0.705932080745697,
"logits/rejected": 0.645165205001831,
"logps/chosen": -325.68902587890625,
"logps/rejected": -348.7044677734375,
"loss": 0.3736,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -13.175191879272461,
"rewards/margins": 2.545431613922119,
"rewards/rejected": -15.720623970031738,
"step": 3980
},
{
"epoch": 1.1361047835990887,
"grad_norm": 21.49675750732422,
"learning_rate": 1.3993206150341685e-05,
"logits/chosen": 0.7589794397354126,
"logits/rejected": 0.7352560758590698,
"logps/chosen": -317.130859375,
"logps/rejected": -333.79876708984375,
"loss": 0.5117,
"rewards/accuracies": 0.75,
"rewards/chosen": -12.715624809265137,
"rewards/margins": 1.7535444498062134,
"rewards/rejected": -14.469167709350586,
"step": 3990
},
{
"epoch": 1.1389521640091116,
"grad_norm": 22.71150016784668,
"learning_rate": 1.3992938496583143e-05,
"logits/chosen": 0.7460082769393921,
"logits/rejected": 0.7586153745651245,
"logps/chosen": -302.3631896972656,
"logps/rejected": -322.42266845703125,
"loss": 0.3693,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -11.592964172363281,
"rewards/margins": 2.2121787071228027,
"rewards/rejected": -13.805142402648926,
"step": 4000
}
],
"logging_steps": 10,
"max_steps": 526800,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}