{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1809861234130499, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002952465308532625, "grad_norm": 1.5939877033233643, "learning_rate": 1.4099722468260998e-05, "logits/chosen": 0.8489359617233276, "logits/rejected": 0.8399260640144348, "logps/chosen": -189.28167724609375, "logps/rejected": -187.5765838623047, "loss": 0.6931, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 0.009952417574822903, "rewards/margins": 0.00019365949265193194, "rewards/rejected": 0.00975875835865736, "step": 10 }, { "epoch": 0.00590493061706525, "grad_norm": 2.012295722961426, "learning_rate": 1.4099444936521996e-05, "logits/chosen": 0.619053304195404, "logits/rejected": 0.6149767637252808, "logps/chosen": -189.75486755371094, "logps/rejected": -193.75283813476562, "loss": 0.6936, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.00567560875788331, "rewards/margins": -0.0006045278278179467, "rewards/rejected": -0.005071080289781094, "step": 20 }, { "epoch": 0.008857395925597875, "grad_norm": 1.7735213041305542, "learning_rate": 1.4099167404782995e-05, "logits/chosen": 0.9022265672683716, "logits/rejected": 0.9175864458084106, "logps/chosen": -193.29150390625, "logps/rejected": -190.18130493164062, "loss": 0.6946, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -0.014524741098284721, "rewards/margins": -0.002716802293434739, "rewards/rejected": -0.011807938106358051, "step": 30 }, { "epoch": 0.0118098612341305, "grad_norm": 1.7005783319473267, "learning_rate": 1.4098889873043992e-05, "logits/chosen": 0.9683588743209839, "logits/rejected": 0.9722894430160522, "logps/chosen": -190.8477325439453, "logps/rejected": -192.9807891845703, "loss": 0.6862, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -0.004277267958968878, "rewards/margins": 0.014298351481556892, "rewards/rejected": -0.018575619906187057, "step": 40 }, { "epoch": 0.014762326542663124, "grad_norm": 2.3944554328918457, "learning_rate": 1.409861234130499e-05, "logits/chosen": 0.8564046621322632, "logits/rejected": 0.8505142331123352, "logps/chosen": -197.9243621826172, "logps/rejected": -197.97486877441406, "loss": 0.6837, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00921831838786602, "rewards/margins": 0.01952509582042694, "rewards/rejected": -0.010306778363883495, "step": 50 }, { "epoch": 0.01771479185119575, "grad_norm": 2.062191963195801, "learning_rate": 1.4098334809565988e-05, "logits/chosen": 0.7493852972984314, "logits/rejected": 0.7596369981765747, "logps/chosen": -197.7070770263672, "logps/rejected": -197.5248260498047, "loss": 0.6775, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.007310180459171534, "rewards/margins": 0.03376404196023941, "rewards/rejected": -0.04107422009110451, "step": 60 }, { "epoch": 0.020667257159728374, "grad_norm": 1.8768013715744019, "learning_rate": 1.4098057277826987e-05, "logits/chosen": 0.775458812713623, "logits/rejected": 0.7668648958206177, "logps/chosen": -195.34515380859375, "logps/rejected": -193.77633666992188, "loss": 0.6639, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.02067253366112709, "rewards/margins": 0.06504613161087036, "rewards/rejected": -0.08571867644786835, "step": 70 }, { "epoch": 0.023619722468261, "grad_norm": 2.1941370964050293, "learning_rate": 1.4097779746087984e-05, "logits/chosen": 0.8050420880317688, "logits/rejected": 0.8041003346443176, "logps/chosen": -191.7614288330078, "logps/rejected": -194.5349884033203, "loss": 0.6878, "rewards/accuracies": 0.5499999523162842, "rewards/chosen": -0.06866570562124252, "rewards/margins": 0.01630706712603569, "rewards/rejected": -0.08497275412082672, "step": 80 }, { "epoch": 0.026572187776793623, "grad_norm": 2.083845615386963, "learning_rate": 1.4097502214348981e-05, "logits/chosen": 0.7442329525947571, "logits/rejected": 0.7301486134529114, "logps/chosen": -193.15359497070312, "logps/rejected": -197.1804656982422, "loss": 0.6841, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -0.1024138331413269, "rewards/margins": 0.02556241676211357, "rewards/rejected": -0.12797626852989197, "step": 90 }, { "epoch": 0.029524653085326247, "grad_norm": 1.9674664735794067, "learning_rate": 1.409722468260998e-05, "logits/chosen": 1.0109026432037354, "logits/rejected": 1.0027401447296143, "logps/chosen": -190.721435546875, "logps/rejected": -188.56802368164062, "loss": 0.6634, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.09300872683525085, "rewards/margins": 0.07284916937351227, "rewards/rejected": -0.16585791110992432, "step": 100 }, { "epoch": 0.032477118393858875, "grad_norm": 2.3729922771453857, "learning_rate": 1.4096947150870978e-05, "logits/chosen": 0.8779767751693726, "logits/rejected": 0.8676374554634094, "logps/chosen": -192.65536499023438, "logps/rejected": -195.64593505859375, "loss": 0.6739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05633270740509033, "rewards/margins": 0.04749976471066475, "rewards/rejected": -0.10383248329162598, "step": 110 }, { "epoch": 0.0354295837023915, "grad_norm": 2.5210423469543457, "learning_rate": 1.4096669619131975e-05, "logits/chosen": 0.802282452583313, "logits/rejected": 0.8133144378662109, "logps/chosen": -190.34791564941406, "logps/rejected": -194.04653930664062, "loss": 0.6749, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.038606930524110794, "rewards/margins": 0.05979428440332413, "rewards/rejected": -0.09840121120214462, "step": 120 }, { "epoch": 0.038382049010924124, "grad_norm": 2.7662322521209717, "learning_rate": 1.4096392087392974e-05, "logits/chosen": 0.6787645816802979, "logits/rejected": 0.6753560304641724, "logps/chosen": -192.43038940429688, "logps/rejected": -196.47244262695312, "loss": 0.6778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.019857464358210564, "rewards/margins": 0.053189296275377274, "rewards/rejected": -0.03333183377981186, "step": 130 }, { "epoch": 0.04133451431945675, "grad_norm": 4.272401809692383, "learning_rate": 1.4096114555653971e-05, "logits/chosen": 0.7471339702606201, "logits/rejected": 0.7372208833694458, "logps/chosen": -192.8908233642578, "logps/rejected": -194.90721130371094, "loss": 0.6781, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.010477876290678978, "rewards/margins": 0.058838438242673874, "rewards/rejected": -0.04836054891347885, "step": 140 }, { "epoch": 0.04428697962798937, "grad_norm": 3.688913583755493, "learning_rate": 1.409583702391497e-05, "logits/chosen": 0.8924674987792969, "logits/rejected": 0.895649254322052, "logps/chosen": -193.98822021484375, "logps/rejected": -189.799560546875, "loss": 0.6644, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.08852823823690414, "rewards/margins": 0.08099732547998428, "rewards/rejected": 0.007530921138823032, "step": 150 }, { "epoch": 0.047239444936522, "grad_norm": 3.6807966232299805, "learning_rate": 1.4095559492175967e-05, "logits/chosen": 0.6237049102783203, "logits/rejected": 0.6276236772537231, "logps/chosen": -194.7342987060547, "logps/rejected": -186.8376922607422, "loss": 0.6577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.022360483184456825, "rewards/margins": 0.09028097242116928, "rewards/rejected": -0.0679204910993576, "step": 160 }, { "epoch": 0.05019191024505462, "grad_norm": 2.9169344902038574, "learning_rate": 1.4095281960436966e-05, "logits/chosen": 0.8032541275024414, "logits/rejected": 0.8041223287582397, "logps/chosen": -188.31300354003906, "logps/rejected": -192.86968994140625, "loss": 0.7037, "rewards/accuracies": 0.4833332896232605, "rewards/chosen": 0.06922253221273422, "rewards/margins": -0.00155611929949373, "rewards/rejected": 0.07077865302562714, "step": 170 }, { "epoch": 0.053144375553587246, "grad_norm": 3.907766103744507, "learning_rate": 1.4095004428697963e-05, "logits/chosen": 1.0397979021072388, "logits/rejected": 1.0484521389007568, "logps/chosen": -189.82879638671875, "logps/rejected": -192.89456176757812, "loss": 0.627, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": 0.19225101172924042, "rewards/margins": 0.16633881628513336, "rewards/rejected": 0.025912201032042503, "step": 180 }, { "epoch": 0.05609684086211987, "grad_norm": 3.420940637588501, "learning_rate": 1.4094726896958962e-05, "logits/chosen": 0.9639438390731812, "logits/rejected": 0.9556954503059387, "logps/chosen": -191.4084014892578, "logps/rejected": -194.2454833984375, "loss": 0.694, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.08127955347299576, "rewards/margins": 0.032605238258838654, "rewards/rejected": 0.0486743226647377, "step": 190 }, { "epoch": 0.059049306170652495, "grad_norm": 6.557375907897949, "learning_rate": 1.4094449365219959e-05, "logits/chosen": 0.8427084684371948, "logits/rejected": 0.8574526906013489, "logps/chosen": -194.78231811523438, "logps/rejected": -196.427001953125, "loss": 0.6812, "rewards/accuracies": 0.6833332777023315, "rewards/chosen": 0.07793475687503815, "rewards/margins": 0.057343851774930954, "rewards/rejected": 0.02059089206159115, "step": 200 }, { "epoch": 0.06200177147918512, "grad_norm": 3.6481292247772217, "learning_rate": 1.4094171833480957e-05, "logits/chosen": 0.8689468502998352, "logits/rejected": 0.862633228302002, "logps/chosen": -198.5911407470703, "logps/rejected": -191.7371368408203, "loss": 0.6297, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": 0.10686226934194565, "rewards/margins": 0.15873855352401733, "rewards/rejected": -0.05187627673149109, "step": 210 }, { "epoch": 0.06495423678771775, "grad_norm": 3.8536431789398193, "learning_rate": 1.4093894301741955e-05, "logits/chosen": 0.8399683833122253, "logits/rejected": 0.8384534120559692, "logps/chosen": -193.53671264648438, "logps/rejected": -193.4778594970703, "loss": 0.6457, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.0668548196554184, "rewards/margins": 0.1318102777004242, "rewards/rejected": -0.0649554654955864, "step": 220 }, { "epoch": 0.06790670209625037, "grad_norm": 3.762707233428955, "learning_rate": 1.4093616770002953e-05, "logits/chosen": 0.8894673585891724, "logits/rejected": 0.8852788209915161, "logps/chosen": -195.80856323242188, "logps/rejected": -197.34353637695312, "loss": 0.679, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.08987792581319809, "rewards/margins": 0.06133050471544266, "rewards/rejected": 0.028547415509819984, "step": 230 }, { "epoch": 0.070859167404783, "grad_norm": 3.0046496391296387, "learning_rate": 1.4093339238263952e-05, "logits/chosen": 0.7518049478530884, "logits/rejected": 0.75348299741745, "logps/chosen": -188.33114624023438, "logps/rejected": -189.12734985351562, "loss": 0.6184, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.09577003121376038, "rewards/margins": 0.19026298820972443, "rewards/rejected": -0.09449295699596405, "step": 240 }, { "epoch": 0.07381163271331562, "grad_norm": 5.246212482452393, "learning_rate": 1.4093061706524949e-05, "logits/chosen": 0.9287120699882507, "logits/rejected": 0.9245287179946899, "logps/chosen": -189.4251708984375, "logps/rejected": -193.7753143310547, "loss": 0.6497, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.09206505864858627, "rewards/margins": 0.13260667026042938, "rewards/rejected": -0.040541619062423706, "step": 250 }, { "epoch": 0.07676409802184825, "grad_norm": 3.0860815048217773, "learning_rate": 1.4092784174785946e-05, "logits/chosen": 0.8246654272079468, "logits/rejected": 0.8177183866500854, "logps/chosen": -187.14111328125, "logps/rejected": -195.39491271972656, "loss": 0.5915, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": 0.13858284056186676, "rewards/margins": 0.25852078199386597, "rewards/rejected": -0.11993791162967682, "step": 260 }, { "epoch": 0.07971656333038087, "grad_norm": 4.372045040130615, "learning_rate": 1.4092506643046945e-05, "logits/chosen": 1.063962697982788, "logits/rejected": 1.0670716762542725, "logps/chosen": -193.32443237304688, "logps/rejected": -192.55575561523438, "loss": 0.6796, "rewards/accuracies": 0.5500000715255737, "rewards/chosen": 0.1552315056324005, "rewards/margins": 0.07176170498132706, "rewards/rejected": 0.08346980810165405, "step": 270 }, { "epoch": 0.0826690286389135, "grad_norm": 3.5372889041900635, "learning_rate": 1.4092229111307944e-05, "logits/chosen": 0.6887931823730469, "logits/rejected": 0.6702864170074463, "logps/chosen": -184.47488403320312, "logps/rejected": -191.4892578125, "loss": 0.6173, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": 0.32319122552871704, "rewards/margins": 0.24527156352996826, "rewards/rejected": 0.07791964709758759, "step": 280 }, { "epoch": 0.08562149394744611, "grad_norm": 3.4411633014678955, "learning_rate": 1.409195157956894e-05, "logits/chosen": 0.8443145751953125, "logits/rejected": 0.8365300297737122, "logps/chosen": -185.31179809570312, "logps/rejected": -190.5987548828125, "loss": 0.6046, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": 0.10719159990549088, "rewards/margins": 0.25746795535087585, "rewards/rejected": -0.1502763330936432, "step": 290 }, { "epoch": 0.08857395925597875, "grad_norm": 3.999758243560791, "learning_rate": 1.4091674047829938e-05, "logits/chosen": 0.6611593961715698, "logits/rejected": 0.6465965509414673, "logps/chosen": -186.2289581298828, "logps/rejected": -194.2019500732422, "loss": 0.6017, "rewards/accuracies": 0.75, "rewards/chosen": 0.4272712767124176, "rewards/margins": 0.2914228141307831, "rewards/rejected": 0.13584844768047333, "step": 300 }, { "epoch": 0.09152642456451136, "grad_norm": 4.495447158813477, "learning_rate": 1.4091396516090937e-05, "logits/chosen": 0.9295717477798462, "logits/rejected": 0.9223917126655579, "logps/chosen": -189.1553497314453, "logps/rejected": -194.3542938232422, "loss": 0.5437, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.3489242196083069, "rewards/margins": 0.39379146695137024, "rewards/rejected": -0.044867224991321564, "step": 310 }, { "epoch": 0.094478889873044, "grad_norm": 5.714067459106445, "learning_rate": 1.4091118984351935e-05, "logits/chosen": 0.8517519235610962, "logits/rejected": 0.8555078506469727, "logps/chosen": -188.68185424804688, "logps/rejected": -192.70802307128906, "loss": 0.6018, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 0.2350027859210968, "rewards/margins": 0.2580421566963196, "rewards/rejected": -0.023039374500513077, "step": 320 }, { "epoch": 0.09743135518157661, "grad_norm": 5.241026401519775, "learning_rate": 1.4090841452612932e-05, "logits/chosen": 0.6248332858085632, "logits/rejected": 0.6356890797615051, "logps/chosen": -192.17160034179688, "logps/rejected": -194.1638641357422, "loss": 0.6551, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.16749046742916107, "rewards/margins": 0.17421643435955048, "rewards/rejected": -0.006725951097905636, "step": 330 }, { "epoch": 0.10038382049010924, "grad_norm": 4.596611022949219, "learning_rate": 1.409056392087393e-05, "logits/chosen": 1.0997419357299805, "logits/rejected": 1.1025121212005615, "logps/chosen": -190.0499267578125, "logps/rejected": -189.81427001953125, "loss": 0.5501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.22568830847740173, "rewards/margins": 0.3878404498100281, "rewards/rejected": -0.16215215623378754, "step": 340 }, { "epoch": 0.10333628579864186, "grad_norm": 4.661524295806885, "learning_rate": 1.4090286389134928e-05, "logits/chosen": 0.9030084609985352, "logits/rejected": 0.9074040651321411, "logps/chosen": -198.10415649414062, "logps/rejected": -196.96624755859375, "loss": 0.6571, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -0.3411143124103546, "rewards/margins": 0.1762751042842865, "rewards/rejected": -0.5173894166946411, "step": 350 }, { "epoch": 0.10628875110717449, "grad_norm": 4.915902137756348, "learning_rate": 1.4090008857395927e-05, "logits/chosen": 1.064614176750183, "logits/rejected": 1.058632493019104, "logps/chosen": -197.1112823486328, "logps/rejected": -203.41201782226562, "loss": 0.5755, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": -0.18287493288516998, "rewards/margins": 0.32825222611427307, "rewards/rejected": -0.5111271739006042, "step": 360 }, { "epoch": 0.10924121641570711, "grad_norm": 5.913583278656006, "learning_rate": 1.4089731325656924e-05, "logits/chosen": 1.1067864894866943, "logits/rejected": 1.1033415794372559, "logps/chosen": -198.59512329101562, "logps/rejected": -201.04820251464844, "loss": 0.6275, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2866785526275635, "rewards/margins": 0.2042660415172577, "rewards/rejected": -0.4909445643424988, "step": 370 }, { "epoch": 0.11219368172423974, "grad_norm": 4.172982215881348, "learning_rate": 1.4089453793917921e-05, "logits/chosen": 0.6820765733718872, "logits/rejected": 0.6808046698570251, "logps/chosen": -190.4650115966797, "logps/rejected": -196.33065795898438, "loss": 0.5756, "rewards/accuracies": 0.7000001072883606, "rewards/chosen": -0.13575351238250732, "rewards/margins": 0.4057921767234802, "rewards/rejected": -0.5415457487106323, "step": 380 }, { "epoch": 0.11514614703277236, "grad_norm": 6.88010835647583, "learning_rate": 1.408917626217892e-05, "logits/chosen": 0.7773826718330383, "logits/rejected": 0.7827831506729126, "logps/chosen": -192.75234985351562, "logps/rejected": -195.40585327148438, "loss": 0.5911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.052654195576906204, "rewards/margins": 0.3769793212413788, "rewards/rejected": -0.3243251442909241, "step": 390 }, { "epoch": 0.11809861234130499, "grad_norm": 5.4537353515625, "learning_rate": 1.4088898730439919e-05, "logits/chosen": 0.9932680130004883, "logits/rejected": 1.0090506076812744, "logps/chosen": -194.18597412109375, "logps/rejected": -191.88424682617188, "loss": 0.6474, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": 0.14818041026592255, "rewards/margins": 0.2826191484928131, "rewards/rejected": -0.13443872332572937, "step": 400 }, { "epoch": 0.12105107764983762, "grad_norm": 4.787024974822998, "learning_rate": 1.4088621198700916e-05, "logits/chosen": 1.0814502239227295, "logits/rejected": 1.0768681764602661, "logps/chosen": -187.91326904296875, "logps/rejected": -190.1375274658203, "loss": 0.5517, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": 0.16685490310192108, "rewards/margins": 0.4738074839115143, "rewards/rejected": -0.306952565908432, "step": 410 }, { "epoch": 0.12400354295837024, "grad_norm": 5.797351360321045, "learning_rate": 1.4088343666961913e-05, "logits/chosen": 0.9605782628059387, "logits/rejected": 0.9518300890922546, "logps/chosen": -195.5142364501953, "logps/rejected": -198.172119140625, "loss": 0.5501, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.20397381484508514, "rewards/margins": 0.4489016532897949, "rewards/rejected": -0.6528754830360413, "step": 420 }, { "epoch": 0.12695600826690287, "grad_norm": 6.211864471435547, "learning_rate": 1.4088066135222911e-05, "logits/chosen": 1.0396544933319092, "logits/rejected": 1.0322144031524658, "logps/chosen": -197.70849609375, "logps/rejected": -198.70059204101562, "loss": 0.653, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2812974452972412, "rewards/margins": 0.22553567588329315, "rewards/rejected": -0.5068331360816956, "step": 430 }, { "epoch": 0.1299084735754355, "grad_norm": 13.63403034210205, "learning_rate": 1.408778860348391e-05, "logits/chosen": 0.8653265237808228, "logits/rejected": 0.8666241765022278, "logps/chosen": -192.80062866210938, "logps/rejected": -202.66195678710938, "loss": 0.5917, "rewards/accuracies": 0.6833334565162659, "rewards/chosen": -0.10747797787189484, "rewards/margins": 0.40785497426986694, "rewards/rejected": -0.5153329968452454, "step": 440 }, { "epoch": 0.1328609388839681, "grad_norm": 5.498272895812988, "learning_rate": 1.4087511071744907e-05, "logits/chosen": 0.9190497398376465, "logits/rejected": 0.8973766565322876, "logps/chosen": -196.6714324951172, "logps/rejected": -198.79244995117188, "loss": 0.4905, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.09642244875431061, "rewards/margins": 0.6152997016906738, "rewards/rejected": -0.7117221355438232, "step": 450 }, { "epoch": 0.13581340419250074, "grad_norm": 9.896668434143066, "learning_rate": 1.4087233540005906e-05, "logits/chosen": 0.8393335342407227, "logits/rejected": 0.8556126356124878, "logps/chosen": -197.34164428710938, "logps/rejected": -195.4841766357422, "loss": 0.5906, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.22712747752666473, "rewards/margins": 0.40703386068344116, "rewards/rejected": -0.6341613531112671, "step": 460 }, { "epoch": 0.13876586950103337, "grad_norm": 5.764206886291504, "learning_rate": 1.4086956008266903e-05, "logits/chosen": 0.6542803049087524, "logits/rejected": 0.6607316136360168, "logps/chosen": -193.38580322265625, "logps/rejected": -196.74093627929688, "loss": 0.521, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.24029307067394257, "rewards/margins": 0.5755614638328552, "rewards/rejected": -0.815854549407959, "step": 470 }, { "epoch": 0.141718334809566, "grad_norm": 5.485475063323975, "learning_rate": 1.4086678476527902e-05, "logits/chosen": 0.9456082582473755, "logits/rejected": 0.9408715963363647, "logps/chosen": -189.89215087890625, "logps/rejected": -199.4336395263672, "loss": 0.6002, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.017454860731959343, "rewards/margins": 0.3381180167198181, "rewards/rejected": -0.3555728793144226, "step": 480 }, { "epoch": 0.1446708001180986, "grad_norm": 7.72708797454834, "learning_rate": 1.4086400944788899e-05, "logits/chosen": 1.1940948963165283, "logits/rejected": 1.1848459243774414, "logps/chosen": -193.41622924804688, "logps/rejected": -196.96078491210938, "loss": 0.5906, "rewards/accuracies": 0.6833332777023315, "rewards/chosen": -0.021615929901599884, "rewards/margins": 0.39761602878570557, "rewards/rejected": -0.41923195123672485, "step": 490 }, { "epoch": 0.14762326542663123, "grad_norm": 10.70683479309082, "learning_rate": 1.4086123413049898e-05, "logits/chosen": 0.9918981790542603, "logits/rejected": 0.9840221405029297, "logps/chosen": -190.93740844726562, "logps/rejected": -197.23489379882812, "loss": 0.587, "rewards/accuracies": 0.7333334684371948, "rewards/chosen": 0.04888393357396126, "rewards/margins": 0.3439929485321045, "rewards/rejected": -0.29510897397994995, "step": 500 }, { "epoch": 0.15057573073516387, "grad_norm": 5.849313259124756, "learning_rate": 1.4085845881310895e-05, "logits/chosen": 1.0909477472305298, "logits/rejected": 1.0899497270584106, "logps/chosen": -188.38790893554688, "logps/rejected": -192.1708221435547, "loss": 0.5909, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10718566179275513, "rewards/margins": 0.41812753677368164, "rewards/rejected": -0.3109418451786041, "step": 510 }, { "epoch": 0.1535281960436965, "grad_norm": 7.840180397033691, "learning_rate": 1.4085568349571893e-05, "logits/chosen": 0.9468757510185242, "logits/rejected": 0.945979118347168, "logps/chosen": -193.09170532226562, "logps/rejected": -199.43043518066406, "loss": 0.6186, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10761149227619171, "rewards/margins": 0.3610059320926666, "rewards/rejected": -0.46861737966537476, "step": 520 }, { "epoch": 0.1564806613522291, "grad_norm": 4.524003982543945, "learning_rate": 1.408529081783289e-05, "logits/chosen": 0.9438020586967468, "logits/rejected": 0.9292134046554565, "logps/chosen": -196.98789978027344, "logps/rejected": -198.87074279785156, "loss": 0.5877, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.3431043028831482, "rewards/margins": 0.40301164984703064, "rewards/rejected": -0.7461159825325012, "step": 530 }, { "epoch": 0.15943312666076173, "grad_norm": 4.712222576141357, "learning_rate": 1.408501328609389e-05, "logits/chosen": 0.9429192543029785, "logits/rejected": 0.9369012713432312, "logps/chosen": -195.4066162109375, "logps/rejected": -199.2905731201172, "loss": 0.5738, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -0.2270047664642334, "rewards/margins": 0.451803594827652, "rewards/rejected": -0.6788083910942078, "step": 540 }, { "epoch": 0.16238559196929436, "grad_norm": 7.926249027252197, "learning_rate": 1.4084735754354886e-05, "logits/chosen": 1.0680028200149536, "logits/rejected": 1.06380033493042, "logps/chosen": -193.03369140625, "logps/rejected": -200.2340545654297, "loss": 0.4624, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.1726120412349701, "rewards/margins": 0.8310006856918335, "rewards/rejected": -1.003612756729126, "step": 550 }, { "epoch": 0.165338057277827, "grad_norm": 12.272015571594238, "learning_rate": 1.4084458222615885e-05, "logits/chosen": 0.9972988367080688, "logits/rejected": 0.9785882830619812, "logps/chosen": -197.4178009033203, "logps/rejected": -204.10533142089844, "loss": 0.595, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.3591177761554718, "rewards/margins": 0.4229649603366852, "rewards/rejected": -0.7820826768875122, "step": 560 }, { "epoch": 0.1682905225863596, "grad_norm": 10.449053764343262, "learning_rate": 1.4084180690876882e-05, "logits/chosen": 0.9894211888313293, "logits/rejected": 0.9773244857788086, "logps/chosen": -196.0491943359375, "logps/rejected": -202.52410888671875, "loss": 0.5901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7734084129333496, "rewards/margins": 0.42861872911453247, "rewards/rejected": -1.2020270824432373, "step": 570 }, { "epoch": 0.17124298789489223, "grad_norm": 10.703384399414062, "learning_rate": 1.4083903159137881e-05, "logits/chosen": 1.053659200668335, "logits/rejected": 1.0548425912857056, "logps/chosen": -202.9679718017578, "logps/rejected": -209.8126983642578, "loss": 0.6355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9700139760971069, "rewards/margins": 0.42744168639183044, "rewards/rejected": -1.3974555730819702, "step": 580 }, { "epoch": 0.17419545320342486, "grad_norm": 11.854584693908691, "learning_rate": 1.4083625627398878e-05, "logits/chosen": 1.1056914329528809, "logits/rejected": 1.09839928150177, "logps/chosen": -198.17489624023438, "logps/rejected": -204.50660705566406, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": -0.6704292297363281, "rewards/margins": 0.5699925422668457, "rewards/rejected": -1.2404216527938843, "step": 590 }, { "epoch": 0.1771479185119575, "grad_norm": 10.888213157653809, "learning_rate": 1.4083348095659877e-05, "logits/chosen": 1.0913571119308472, "logits/rejected": 1.1052333116531372, "logps/chosen": -204.839599609375, "logps/rejected": -207.6987762451172, "loss": 0.582, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.9262521862983704, "rewards/margins": 0.5031600594520569, "rewards/rejected": -1.4294121265411377, "step": 600 }, { "epoch": 0.18010038382049012, "grad_norm": 13.287003517150879, "learning_rate": 1.4083070563920876e-05, "logits/chosen": 0.7106370329856873, "logits/rejected": 0.70152747631073, "logps/chosen": -200.42105102539062, "logps/rejected": -207.2209930419922, "loss": 0.563, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.9491299390792847, "rewards/margins": 0.5422946214675903, "rewards/rejected": -1.4914246797561646, "step": 610 }, { "epoch": 0.18305284912902273, "grad_norm": 23.820791244506836, "learning_rate": 1.4082793032181873e-05, "logits/chosen": 1.1230837106704712, "logits/rejected": 1.107384443283081, "logps/chosen": -200.9855194091797, "logps/rejected": -209.40243530273438, "loss": 0.549, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.995921790599823, "rewards/margins": 0.6092842221260071, "rewards/rejected": -1.6052058935165405, "step": 620 }, { "epoch": 0.18600531443755536, "grad_norm": 19.415285110473633, "learning_rate": 1.408251550044287e-05, "logits/chosen": 1.1334234476089478, "logits/rejected": 1.1140673160552979, "logps/chosen": -200.0419464111328, "logps/rejected": -205.5067596435547, "loss": 0.5984, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -0.9103124737739563, "rewards/margins": 0.5052827596664429, "rewards/rejected": -1.4155951738357544, "step": 630 }, { "epoch": 0.188957779746088, "grad_norm": 9.42514705657959, "learning_rate": 1.4082237968703867e-05, "logits/chosen": 0.942944347858429, "logits/rejected": 0.932725727558136, "logps/chosen": -200.16371154785156, "logps/rejected": -198.94093322753906, "loss": 0.5465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5651094317436218, "rewards/margins": 0.5419740676879883, "rewards/rejected": -1.1070835590362549, "step": 640 }, { "epoch": 0.19191024505462062, "grad_norm": 6.785238265991211, "learning_rate": 1.4081960436964867e-05, "logits/chosen": 0.8338491320610046, "logits/rejected": 0.8452416658401489, "logps/chosen": -193.2548828125, "logps/rejected": -196.41104125976562, "loss": 0.5458, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.48633044958114624, "rewards/margins": 0.6316096186637878, "rewards/rejected": -1.1179401874542236, "step": 650 }, { "epoch": 0.19486271036315322, "grad_norm": 10.195595741271973, "learning_rate": 1.4081682905225864e-05, "logits/chosen": 0.9671465158462524, "logits/rejected": 0.9489065408706665, "logps/chosen": -196.73159790039062, "logps/rejected": -202.5388946533203, "loss": 0.5907, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.28963369131088257, "rewards/margins": 0.6467798948287964, "rewards/rejected": -0.9364136457443237, "step": 660 }, { "epoch": 0.19781517567168586, "grad_norm": 10.559146881103516, "learning_rate": 1.4081405373486861e-05, "logits/chosen": 1.1955444812774658, "logits/rejected": 1.1915665864944458, "logps/chosen": -194.31130981445312, "logps/rejected": -202.56417846679688, "loss": 0.6285, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.15417079627513885, "rewards/margins": 0.3329012393951416, "rewards/rejected": -0.48707205057144165, "step": 670 }, { "epoch": 0.2007676409802185, "grad_norm": 9.974637031555176, "learning_rate": 1.408112784174786e-05, "logits/chosen": 1.2191699743270874, "logits/rejected": 1.201953649520874, "logps/chosen": -196.7225341796875, "logps/rejected": -199.62205505371094, "loss": 0.6821, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.1129167228937149, "rewards/margins": 0.3532589077949524, "rewards/rejected": -0.46617555618286133, "step": 680 }, { "epoch": 0.20372010628875112, "grad_norm": 12.815214157104492, "learning_rate": 1.4080850310008859e-05, "logits/chosen": 1.0013329982757568, "logits/rejected": 0.9852606058120728, "logps/chosen": -194.14962768554688, "logps/rejected": -203.47850036621094, "loss": 0.6511, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.3127971291542053, "rewards/margins": 0.30729609727859497, "rewards/rejected": -0.6200932264328003, "step": 690 }, { "epoch": 0.20667257159728372, "grad_norm": 12.249149322509766, "learning_rate": 1.4080572778269856e-05, "logits/chosen": 1.2233773469924927, "logits/rejected": 1.2129541635513306, "logps/chosen": -193.9405517578125, "logps/rejected": -200.7108917236328, "loss": 0.5773, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.28493785858154297, "rewards/margins": 0.4443270266056061, "rewards/rejected": -0.7292648553848267, "step": 700 }, { "epoch": 0.20962503690581635, "grad_norm": 5.710248947143555, "learning_rate": 1.4080295246530853e-05, "logits/chosen": 0.8811189532279968, "logits/rejected": 0.8761937022209167, "logps/chosen": -196.33197021484375, "logps/rejected": -205.0043487548828, "loss": 0.443, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.4988059997558594, "rewards/margins": 0.9298487901687622, "rewards/rejected": -1.4286547899246216, "step": 710 }, { "epoch": 0.21257750221434898, "grad_norm": 5.529027938842773, "learning_rate": 1.4080017714791852e-05, "logits/chosen": 1.0330538749694824, "logits/rejected": 1.016880750656128, "logps/chosen": -210.85986328125, "logps/rejected": -214.50497436523438, "loss": 0.6194, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -1.6915702819824219, "rewards/margins": 0.468742698431015, "rewards/rejected": -2.160313129425049, "step": 720 }, { "epoch": 0.21552996752288162, "grad_norm": 5.890848636627197, "learning_rate": 1.407974018305285e-05, "logits/chosen": 0.5451382398605347, "logits/rejected": 0.5453681349754333, "logps/chosen": -208.1138458251953, "logps/rejected": -212.5973663330078, "loss": 0.6006, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -1.7641716003417969, "rewards/margins": 0.3806423842906952, "rewards/rejected": -2.1448140144348145, "step": 730 }, { "epoch": 0.21848243283141422, "grad_norm": 4.792769908905029, "learning_rate": 1.4079462651313847e-05, "logits/chosen": 0.989889919757843, "logits/rejected": 0.963244616985321, "logps/chosen": -212.3883514404297, "logps/rejected": -215.52810668945312, "loss": 0.5192, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -1.7187093496322632, "rewards/margins": 0.6268240213394165, "rewards/rejected": -2.3455333709716797, "step": 740 }, { "epoch": 0.22143489813994685, "grad_norm": 8.6510009765625, "learning_rate": 1.4079185119574845e-05, "logits/chosen": 0.7344129681587219, "logits/rejected": 0.7326828241348267, "logps/chosen": -210.7428436279297, "logps/rejected": -217.3068084716797, "loss": 0.5463, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -1.5295541286468506, "rewards/margins": 0.6142388582229614, "rewards/rejected": -2.1437928676605225, "step": 750 }, { "epoch": 0.22438736344847948, "grad_norm": 9.542976379394531, "learning_rate": 1.4078907587835843e-05, "logits/chosen": 1.1824983358383179, "logits/rejected": 1.1680859327316284, "logps/chosen": -208.3236846923828, "logps/rejected": -213.2423095703125, "loss": 0.5896, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -1.367111086845398, "rewards/margins": 0.619563102722168, "rewards/rejected": -1.9866743087768555, "step": 760 }, { "epoch": 0.2273398287570121, "grad_norm": 4.733201503753662, "learning_rate": 1.4078630056096842e-05, "logits/chosen": 1.1759603023529053, "logits/rejected": 1.1727043390274048, "logps/chosen": -199.46826171875, "logps/rejected": -207.35440063476562, "loss": 0.5045, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -0.7298592329025269, "rewards/margins": 0.8125484585762024, "rewards/rejected": -1.5424076318740845, "step": 770 }, { "epoch": 0.23029229406554472, "grad_norm": 7.356558799743652, "learning_rate": 1.4078352524357839e-05, "logits/chosen": 0.7975548505783081, "logits/rejected": 0.7792436480522156, "logps/chosen": -200.87474060058594, "logps/rejected": -211.59774780273438, "loss": 0.5265, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.8150293231010437, "rewards/margins": 0.7849806547164917, "rewards/rejected": -1.6000099182128906, "step": 780 }, { "epoch": 0.23324475937407735, "grad_norm": 8.332216262817383, "learning_rate": 1.4078074992618838e-05, "logits/chosen": 0.9103379249572754, "logits/rejected": 0.8947712779045105, "logps/chosen": -203.49302673339844, "logps/rejected": -210.7021484375, "loss": 0.4911, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.0461088418960571, "rewards/margins": 0.8985697627067566, "rewards/rejected": -1.9446786642074585, "step": 790 }, { "epoch": 0.23619722468260998, "grad_norm": 4.8086137771606445, "learning_rate": 1.4077797460879835e-05, "logits/chosen": 1.1948597431182861, "logits/rejected": 1.1884344816207886, "logps/chosen": -200.65707397460938, "logps/rejected": -207.1472625732422, "loss": 0.5924, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.8394579887390137, "rewards/margins": 0.6348735690116882, "rewards/rejected": -1.4743314981460571, "step": 800 }, { "epoch": 0.2391496899911426, "grad_norm": 8.950235366821289, "learning_rate": 1.4077519929140834e-05, "logits/chosen": 1.1350468397140503, "logits/rejected": 1.1287434101104736, "logps/chosen": -198.51931762695312, "logps/rejected": -205.2808380126953, "loss": 0.6015, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.47419366240501404, "rewards/margins": 0.6330995559692383, "rewards/rejected": -1.1072933673858643, "step": 810 }, { "epoch": 0.24210215529967524, "grad_norm": 9.155820846557617, "learning_rate": 1.407724239740183e-05, "logits/chosen": 1.0686590671539307, "logits/rejected": 1.068099856376648, "logps/chosen": -206.78536987304688, "logps/rejected": -212.2686767578125, "loss": 0.4983, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.7780241370201111, "rewards/margins": 0.7813541889190674, "rewards/rejected": -1.5593783855438232, "step": 820 }, { "epoch": 0.24505462060820785, "grad_norm": 7.940489768981934, "learning_rate": 1.407696486566283e-05, "logits/chosen": 1.005444884300232, "logits/rejected": 0.9800812005996704, "logps/chosen": -212.26083374023438, "logps/rejected": -221.26412963867188, "loss": 0.5596, "rewards/accuracies": 0.6833333969116211, "rewards/chosen": -1.6419061422348022, "rewards/margins": 0.5535315871238708, "rewards/rejected": -2.1954379081726074, "step": 830 }, { "epoch": 0.24800708591674048, "grad_norm": 11.276350021362305, "learning_rate": 1.4076687333923827e-05, "logits/chosen": 0.9107723236083984, "logits/rejected": 0.9086516499519348, "logps/chosen": -211.13070678710938, "logps/rejected": -211.6840057373047, "loss": 0.5372, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -1.5946826934814453, "rewards/margins": 0.676060140132904, "rewards/rejected": -2.270742893218994, "step": 840 }, { "epoch": 0.2509595512252731, "grad_norm": 6.618904113769531, "learning_rate": 1.4076409802184825e-05, "logits/chosen": 1.1765861511230469, "logits/rejected": 1.1829272508621216, "logps/chosen": -202.9951934814453, "logps/rejected": -211.509033203125, "loss": 0.5817, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -1.4554831981658936, "rewards/margins": 0.6993283033370972, "rewards/rejected": -2.1548118591308594, "step": 850 }, { "epoch": 0.25391201653380574, "grad_norm": 7.729018211364746, "learning_rate": 1.4076132270445822e-05, "logits/chosen": 0.7203906178474426, "logits/rejected": 0.7145841717720032, "logps/chosen": -209.3985595703125, "logps/rejected": -219.42257690429688, "loss": 0.5081, "rewards/accuracies": 0.7500001192092896, "rewards/chosen": -1.245046615600586, "rewards/margins": 0.950055718421936, "rewards/rejected": -2.1951022148132324, "step": 860 }, { "epoch": 0.25686448184233834, "grad_norm": 6.480947971343994, "learning_rate": 1.4075854738706821e-05, "logits/chosen": 0.8916576504707336, "logits/rejected": 0.8874212503433228, "logps/chosen": -197.1162872314453, "logps/rejected": -206.9589080810547, "loss": 0.4501, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -0.8906022906303406, "rewards/margins": 0.8994898796081543, "rewards/rejected": -1.7900922298431396, "step": 870 }, { "epoch": 0.259816947150871, "grad_norm": 6.913644313812256, "learning_rate": 1.4075577206967818e-05, "logits/chosen": 1.1063673496246338, "logits/rejected": 1.0881317853927612, "logps/chosen": -208.00424194335938, "logps/rejected": -213.5292205810547, "loss": 0.5736, "rewards/accuracies": 0.6833332777023315, "rewards/chosen": -1.2689714431762695, "rewards/margins": 0.7586643099784851, "rewards/rejected": -2.0276355743408203, "step": 880 }, { "epoch": 0.2627694124594036, "grad_norm": 9.831883430480957, "learning_rate": 1.4075299675228817e-05, "logits/chosen": 0.8115822076797485, "logits/rejected": 0.7997537851333618, "logps/chosen": -205.03759765625, "logps/rejected": -215.96548461914062, "loss": 0.6068, "rewards/accuracies": 0.716666579246521, "rewards/chosen": -1.7441415786743164, "rewards/margins": 0.7121504545211792, "rewards/rejected": -2.456291913986206, "step": 890 }, { "epoch": 0.2657218777679362, "grad_norm": 15.932978630065918, "learning_rate": 1.4075022143489814e-05, "logits/chosen": 1.4613250494003296, "logits/rejected": 1.4576307535171509, "logps/chosen": -209.42648315429688, "logps/rejected": -220.22607421875, "loss": 0.6165, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.090859889984131, "rewards/margins": 0.5428851246833801, "rewards/rejected": -2.633744716644287, "step": 900 }, { "epoch": 0.26867434307646887, "grad_norm": 7.22115421295166, "learning_rate": 1.4074744611750813e-05, "logits/chosen": 1.1471521854400635, "logits/rejected": 1.1732441186904907, "logps/chosen": -211.0574188232422, "logps/rejected": -218.68807983398438, "loss": 0.4691, "rewards/accuracies": 0.75, "rewards/chosen": -1.4584523439407349, "rewards/margins": 0.9976604580879211, "rewards/rejected": -2.456112861633301, "step": 910 }, { "epoch": 0.2716268083850015, "grad_norm": 10.624491691589355, "learning_rate": 1.407446708001181e-05, "logits/chosen": 1.1711971759796143, "logits/rejected": 1.1646883487701416, "logps/chosen": -211.31753540039062, "logps/rejected": -221.79580688476562, "loss": 0.4922, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -1.8762410879135132, "rewards/margins": 0.9315407872200012, "rewards/rejected": -2.807781934738159, "step": 920 }, { "epoch": 0.2745792736935341, "grad_norm": 7.527136325836182, "learning_rate": 1.4074189548272809e-05, "logits/chosen": 0.6628230214118958, "logits/rejected": 0.6545461416244507, "logps/chosen": -213.47592163085938, "logps/rejected": -219.8159637451172, "loss": 0.6115, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -2.248460054397583, "rewards/margins": 0.7793260812759399, "rewards/rejected": -3.0277862548828125, "step": 930 }, { "epoch": 0.27753173900206674, "grad_norm": 3.208237409591675, "learning_rate": 1.4073912016533807e-05, "logits/chosen": 1.0598645210266113, "logits/rejected": 1.0432993173599243, "logps/chosen": -209.75564575195312, "logps/rejected": -218.7815704345703, "loss": 0.605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.002910852432251, "rewards/margins": 0.773485004901886, "rewards/rejected": -2.7763960361480713, "step": 940 }, { "epoch": 0.28048420431059934, "grad_norm": 17.750818252563477, "learning_rate": 1.4073634484794804e-05, "logits/chosen": 1.1378564834594727, "logits/rejected": 1.132851481437683, "logps/chosen": -215.0715789794922, "logps/rejected": -220.6090545654297, "loss": 0.5407, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -2.0894715785980225, "rewards/margins": 0.7593616247177124, "rewards/rejected": -2.8488330841064453, "step": 950 }, { "epoch": 0.283436669619132, "grad_norm": 7.516430377960205, "learning_rate": 1.4073356953055802e-05, "logits/chosen": 0.9056533575057983, "logits/rejected": 0.8890334963798523, "logps/chosen": -209.2738800048828, "logps/rejected": -217.39346313476562, "loss": 0.496, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -1.1526466608047485, "rewards/margins": 0.9390742182731628, "rewards/rejected": -2.0917210578918457, "step": 960 }, { "epoch": 0.2863891349276646, "grad_norm": 7.022889614105225, "learning_rate": 1.4073079421316799e-05, "logits/chosen": 0.7717276811599731, "logits/rejected": 0.7703901529312134, "logps/chosen": -202.55868530273438, "logps/rejected": -208.3015899658203, "loss": 0.6059, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -1.2014471292495728, "rewards/margins": 0.685221791267395, "rewards/rejected": -1.8866688013076782, "step": 970 }, { "epoch": 0.2893416002361972, "grad_norm": 14.980413436889648, "learning_rate": 1.4072801889577799e-05, "logits/chosen": 1.0208542346954346, "logits/rejected": 1.0085170269012451, "logps/chosen": -197.9685516357422, "logps/rejected": -210.98629760742188, "loss": 0.6272, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.9504068493843079, "rewards/margins": 0.6253520250320435, "rewards/rejected": -1.575758695602417, "step": 980 }, { "epoch": 0.29229406554472986, "grad_norm": 7.794418811798096, "learning_rate": 1.4072524357838796e-05, "logits/chosen": 1.0170702934265137, "logits/rejected": 1.0216273069381714, "logps/chosen": -198.47373962402344, "logps/rejected": -209.2837371826172, "loss": 0.5804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9993124008178711, "rewards/margins": 0.5822805166244507, "rewards/rejected": -1.5815930366516113, "step": 990 }, { "epoch": 0.29524653085326247, "grad_norm": 11.953492164611816, "learning_rate": 1.4072246826099793e-05, "logits/chosen": 1.1830928325653076, "logits/rejected": 1.1834582090377808, "logps/chosen": -202.09329223632812, "logps/rejected": -208.96224975585938, "loss": 0.6322, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1467962265014648, "rewards/margins": 0.451543390750885, "rewards/rejected": -1.5983396768569946, "step": 1000 }, { "epoch": 0.29819899616179507, "grad_norm": 12.079113960266113, "learning_rate": 1.4071969294360792e-05, "logits/chosen": 1.068080186843872, "logits/rejected": 1.0618536472320557, "logps/chosen": -202.75405883789062, "logps/rejected": -209.6384735107422, "loss": 0.5319, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -1.1548467874526978, "rewards/margins": 0.7879319787025452, "rewards/rejected": -1.9427785873413086, "step": 1010 }, { "epoch": 0.30115146147032773, "grad_norm": 6.433148384094238, "learning_rate": 1.407169176262179e-05, "logits/chosen": 1.0946038961410522, "logits/rejected": 1.0803263187408447, "logps/chosen": -201.4412841796875, "logps/rejected": -210.1566162109375, "loss": 0.5863, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -1.0016964673995972, "rewards/margins": 0.5949385166168213, "rewards/rejected": -1.596635103225708, "step": 1020 }, { "epoch": 0.30410392677886033, "grad_norm": 4.3994526863098145, "learning_rate": 1.4071414230882788e-05, "logits/chosen": 1.2642898559570312, "logits/rejected": 1.2663602828979492, "logps/chosen": -202.36788940429688, "logps/rejected": -212.39083862304688, "loss": 0.4693, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.9236468076705933, "rewards/margins": 1.0039184093475342, "rewards/rejected": -1.9275649785995483, "step": 1030 }, { "epoch": 0.307056392087393, "grad_norm": 10.424783706665039, "learning_rate": 1.4071136699143785e-05, "logits/chosen": 0.9276378750801086, "logits/rejected": 0.9241577386856079, "logps/chosen": -207.08981323242188, "logps/rejected": -213.34823608398438, "loss": 0.5518, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -1.4125430583953857, "rewards/margins": 0.8704258799552917, "rewards/rejected": -2.2829689979553223, "step": 1040 }, { "epoch": 0.3100088573959256, "grad_norm": 10.45077896118164, "learning_rate": 1.4070859167404784e-05, "logits/chosen": 0.8593951463699341, "logits/rejected": 0.8654651641845703, "logps/chosen": -213.75851440429688, "logps/rejected": -222.3131103515625, "loss": 0.5619, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -1.7373164892196655, "rewards/margins": 0.8900936245918274, "rewards/rejected": -2.627410411834717, "step": 1050 }, { "epoch": 0.3129613227044582, "grad_norm": 6.611659049987793, "learning_rate": 1.4070581635665782e-05, "logits/chosen": 1.1823264360427856, "logits/rejected": 1.1701545715332031, "logps/chosen": -205.524169921875, "logps/rejected": -219.41641235351562, "loss": 0.4416, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -1.8381435871124268, "rewards/margins": 1.3411756753921509, "rewards/rejected": -3.179318904876709, "step": 1060 }, { "epoch": 0.31591378801299086, "grad_norm": 13.64857292175293, "learning_rate": 1.407030410392678e-05, "logits/chosen": 1.0460575819015503, "logits/rejected": 1.0331449508666992, "logps/chosen": -216.8398895263672, "logps/rejected": -229.5764617919922, "loss": 0.4547, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -2.5006532669067383, "rewards/margins": 1.3229708671569824, "rewards/rejected": -3.8236243724823, "step": 1070 }, { "epoch": 0.31886625332152346, "grad_norm": 6.310172080993652, "learning_rate": 1.4070026572187776e-05, "logits/chosen": 0.9639976620674133, "logits/rejected": 0.9538136720657349, "logps/chosen": -223.3867645263672, "logps/rejected": -234.15139770507812, "loss": 0.7282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3546454906463623, "rewards/margins": 0.6628521680831909, "rewards/rejected": -4.017497539520264, "step": 1080 }, { "epoch": 0.3218187186300561, "grad_norm": 13.645084381103516, "learning_rate": 1.4069749040448775e-05, "logits/chosen": 1.029260277748108, "logits/rejected": 1.02008855342865, "logps/chosen": -221.03662109375, "logps/rejected": -226.8896942138672, "loss": 0.5271, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -2.882402181625366, "rewards/margins": 0.9087856411933899, "rewards/rejected": -3.7911880016326904, "step": 1090 }, { "epoch": 0.3247711839385887, "grad_norm": 22.42061996459961, "learning_rate": 1.4069471508709774e-05, "logits/chosen": 0.8922605514526367, "logits/rejected": 0.8998235464096069, "logps/chosen": -205.65554809570312, "logps/rejected": -220.8451690673828, "loss": 0.4934, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -1.8904451131820679, "rewards/margins": 1.1780626773834229, "rewards/rejected": -3.0685081481933594, "step": 1100 }, { "epoch": 0.32772364924712133, "grad_norm": 6.300334930419922, "learning_rate": 1.4069193976970771e-05, "logits/chosen": 1.2597699165344238, "logits/rejected": 1.2554943561553955, "logps/chosen": -210.4364471435547, "logps/rejected": -221.440673828125, "loss": 0.4471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5781030654907227, "rewards/margins": 1.230252981185913, "rewards/rejected": -2.8083558082580566, "step": 1110 }, { "epoch": 0.330676114555654, "grad_norm": 10.440203666687012, "learning_rate": 1.4068916445231768e-05, "logits/chosen": 1.1553164720535278, "logits/rejected": 1.1412006616592407, "logps/chosen": -213.39157104492188, "logps/rejected": -221.9605712890625, "loss": 0.6089, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -1.8427482843399048, "rewards/margins": 0.6788046360015869, "rewards/rejected": -2.521552801132202, "step": 1120 }, { "epoch": 0.3336285798641866, "grad_norm": 7.161910533905029, "learning_rate": 1.4068638913492767e-05, "logits/chosen": 1.1661531925201416, "logits/rejected": 1.1647756099700928, "logps/chosen": -210.96786499023438, "logps/rejected": -220.00601196289062, "loss": 0.5272, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": -1.6045238971710205, "rewards/margins": 0.9415479898452759, "rewards/rejected": -2.546072006225586, "step": 1130 }, { "epoch": 0.3365810451727192, "grad_norm": 7.390961170196533, "learning_rate": 1.4068361381753766e-05, "logits/chosen": 1.281453251838684, "logits/rejected": 1.2810918092727661, "logps/chosen": -214.9879150390625, "logps/rejected": -225.9520263671875, "loss": 0.4832, "rewards/accuracies": 0.75, "rewards/chosen": -1.7760931253433228, "rewards/margins": 1.2280247211456299, "rewards/rejected": -3.004117488861084, "step": 1140 }, { "epoch": 0.33953351048125185, "grad_norm": 12.220233917236328, "learning_rate": 1.4068083850014763e-05, "logits/chosen": 0.9349085688591003, "logits/rejected": 0.9172965288162231, "logps/chosen": -213.7171630859375, "logps/rejected": -227.9143524169922, "loss": 0.4807, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -2.2655694484710693, "rewards/margins": 1.2999430894851685, "rewards/rejected": -3.565512180328369, "step": 1150 }, { "epoch": 0.34248597578978446, "grad_norm": 5.238320350646973, "learning_rate": 1.4067806318275761e-05, "logits/chosen": 0.9835092425346375, "logits/rejected": 0.981969952583313, "logps/chosen": -212.0533905029297, "logps/rejected": -226.9181671142578, "loss": 0.5062, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -1.6351778507232666, "rewards/margins": 1.1791164875030518, "rewards/rejected": -2.8142943382263184, "step": 1160 }, { "epoch": 0.3454384410983171, "grad_norm": 12.692456245422363, "learning_rate": 1.4067528786536758e-05, "logits/chosen": 1.2191753387451172, "logits/rejected": 1.205960750579834, "logps/chosen": -218.3784942626953, "logps/rejected": -225.16543579101562, "loss": 0.5818, "rewards/accuracies": 0.6833333969116211, "rewards/chosen": -2.7444846630096436, "rewards/margins": 0.8870908617973328, "rewards/rejected": -3.6315758228302, "step": 1170 }, { "epoch": 0.3483909064068497, "grad_norm": 14.48228931427002, "learning_rate": 1.4067251254797757e-05, "logits/chosen": 0.9090206027030945, "logits/rejected": 0.9131497144699097, "logps/chosen": -213.85873413085938, "logps/rejected": -230.63619995117188, "loss": 0.4687, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -2.365603446960449, "rewards/margins": 1.2593224048614502, "rewards/rejected": -3.6249260902404785, "step": 1180 }, { "epoch": 0.3513433717153823, "grad_norm": 4.434173583984375, "learning_rate": 1.4066973723058754e-05, "logits/chosen": 1.1928646564483643, "logits/rejected": 1.1924841403961182, "logps/chosen": -212.88558959960938, "logps/rejected": -221.09048461914062, "loss": 0.5615, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -1.8671756982803345, "rewards/margins": 0.8395135998725891, "rewards/rejected": -2.7066893577575684, "step": 1190 }, { "epoch": 0.354295837023915, "grad_norm": 18.022241592407227, "learning_rate": 1.4066696191319753e-05, "logits/chosen": 1.0654301643371582, "logits/rejected": 1.067916750907898, "logps/chosen": -215.24935913085938, "logps/rejected": -226.2921142578125, "loss": 0.5136, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -2.1677470207214355, "rewards/margins": 1.1760331392288208, "rewards/rejected": -3.343780517578125, "step": 1200 }, { "epoch": 0.3572483023324476, "grad_norm": 13.5853853225708, "learning_rate": 1.406641865958075e-05, "logits/chosen": 1.0499533414840698, "logits/rejected": 1.0321810245513916, "logps/chosen": -207.63290405273438, "logps/rejected": -223.20449829101562, "loss": 0.5076, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -2.002998113632202, "rewards/margins": 1.3239938020706177, "rewards/rejected": -3.3269920349121094, "step": 1210 }, { "epoch": 0.36020076764098025, "grad_norm": 12.515037536621094, "learning_rate": 1.4066141127841749e-05, "logits/chosen": 0.8488761782646179, "logits/rejected": 0.8283529281616211, "logps/chosen": -219.1075439453125, "logps/rejected": -230.21408081054688, "loss": 0.5009, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -2.4314680099487305, "rewards/margins": 1.2061161994934082, "rewards/rejected": -3.6375839710235596, "step": 1220 }, { "epoch": 0.36315323294951285, "grad_norm": 17.978681564331055, "learning_rate": 1.4065863596102746e-05, "logits/chosen": 1.0363500118255615, "logits/rejected": 1.0177322626113892, "logps/chosen": -214.47000122070312, "logps/rejected": -222.55172729492188, "loss": 0.6395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1304965019226074, "rewards/margins": 0.8203921318054199, "rewards/rejected": -2.9508886337280273, "step": 1230 }, { "epoch": 0.36610569825804545, "grad_norm": 12.131927490234375, "learning_rate": 1.4065586064363745e-05, "logits/chosen": 1.117382526397705, "logits/rejected": 1.1108427047729492, "logps/chosen": -205.79342651367188, "logps/rejected": -216.9019012451172, "loss": 0.5211, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -1.582242727279663, "rewards/margins": 0.996538519859314, "rewards/rejected": -2.5787813663482666, "step": 1240 }, { "epoch": 0.3690581635665781, "grad_norm": 7.095322132110596, "learning_rate": 1.4065308532624742e-05, "logits/chosen": 1.2670257091522217, "logits/rejected": 1.2542906999588013, "logps/chosen": -206.429931640625, "logps/rejected": -218.08154296875, "loss": 0.4608, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.6655343770980835, "rewards/margins": 1.0855953693389893, "rewards/rejected": -2.751129627227783, "step": 1250 }, { "epoch": 0.3720106288751107, "grad_norm": 6.73194694519043, "learning_rate": 1.4065031000885739e-05, "logits/chosen": 1.0509501695632935, "logits/rejected": 1.0203218460083008, "logps/chosen": -214.21664428710938, "logps/rejected": -236.8522186279297, "loss": 0.4465, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -2.442366600036621, "rewards/margins": 1.3232686519622803, "rewards/rejected": -3.7656352519989014, "step": 1260 }, { "epoch": 0.3749630941836433, "grad_norm": 5.488243103027344, "learning_rate": 1.406475346914674e-05, "logits/chosen": 0.6406041979789734, "logits/rejected": 0.6522623300552368, "logps/chosen": -218.31088256835938, "logps/rejected": -224.12832641601562, "loss": 0.6012, "rewards/accuracies": 0.6833333969116211, "rewards/chosen": -2.726771116256714, "rewards/margins": 0.8366702198982239, "rewards/rejected": -3.563441038131714, "step": 1270 }, { "epoch": 0.377915559492176, "grad_norm": 11.03882884979248, "learning_rate": 1.4064475937407736e-05, "logits/chosen": 0.9078397750854492, "logits/rejected": 0.9125596880912781, "logps/chosen": -219.7733917236328, "logps/rejected": -225.71435546875, "loss": 0.515, "rewards/accuracies": 0.75, "rewards/chosen": -2.4795327186584473, "rewards/margins": 1.0946376323699951, "rewards/rejected": -3.5741703510284424, "step": 1280 }, { "epoch": 0.3808680248007086, "grad_norm": 19.4429874420166, "learning_rate": 1.4064198405668733e-05, "logits/chosen": 0.9455856084823608, "logits/rejected": 0.9508755803108215, "logps/chosen": -222.911376953125, "logps/rejected": -227.7622528076172, "loss": 0.7851, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -2.9422149658203125, "rewards/margins": 0.4392127990722656, "rewards/rejected": -3.3814282417297363, "step": 1290 }, { "epoch": 0.38382049010924124, "grad_norm": 11.825089454650879, "learning_rate": 1.406392087392973e-05, "logits/chosen": 1.2353079319000244, "logits/rejected": 1.212390661239624, "logps/chosen": -220.1291046142578, "logps/rejected": -226.5579833984375, "loss": 0.4594, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.6262495517730713, "rewards/margins": 1.1487022638320923, "rewards/rejected": -3.774951934814453, "step": 1300 }, { "epoch": 0.38677295541777384, "grad_norm": 10.367622375488281, "learning_rate": 1.4063643342190731e-05, "logits/chosen": 0.8857883214950562, "logits/rejected": 0.8881596326828003, "logps/chosen": -218.17373657226562, "logps/rejected": -229.5813446044922, "loss": 0.5225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6012635231018066, "rewards/margins": 1.0460158586502075, "rewards/rejected": -3.6472792625427246, "step": 1310 }, { "epoch": 0.38972542072630645, "grad_norm": 13.139322280883789, "learning_rate": 1.4063365810451728e-05, "logits/chosen": 1.0150176286697388, "logits/rejected": 1.012414813041687, "logps/chosen": -223.9635467529297, "logps/rejected": -233.0707550048828, "loss": 0.6391, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -2.7967591285705566, "rewards/margins": 0.8449796438217163, "rewards/rejected": -3.6417384147644043, "step": 1320 }, { "epoch": 0.3926778860348391, "grad_norm": 8.575380325317383, "learning_rate": 1.4063088278712725e-05, "logits/chosen": 0.9643731117248535, "logits/rejected": 0.9640272855758667, "logps/chosen": -216.61569213867188, "logps/rejected": -226.29232788085938, "loss": 0.456, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -2.7340173721313477, "rewards/margins": 0.9833616018295288, "rewards/rejected": -3.717378616333008, "step": 1330 }, { "epoch": 0.3956303513433717, "grad_norm": 12.066393852233887, "learning_rate": 1.4062810746973724e-05, "logits/chosen": 1.039768099784851, "logits/rejected": 1.0218461751937866, "logps/chosen": -219.64016723632812, "logps/rejected": -232.13461303710938, "loss": 0.5125, "rewards/accuracies": 0.75, "rewards/chosen": -2.5856773853302, "rewards/margins": 1.1847783327102661, "rewards/rejected": -3.7704551219940186, "step": 1340 }, { "epoch": 0.3985828166519043, "grad_norm": 9.88737678527832, "learning_rate": 1.4062533215234723e-05, "logits/chosen": 0.9569822549819946, "logits/rejected": 0.9541361927986145, "logps/chosen": -214.03048706054688, "logps/rejected": -225.05514526367188, "loss": 0.6239, "rewards/accuracies": 0.7166667580604553, "rewards/chosen": -2.193084955215454, "rewards/margins": 1.0357897281646729, "rewards/rejected": -3.228874683380127, "step": 1350 }, { "epoch": 0.401535281960437, "grad_norm": 4.778758525848389, "learning_rate": 1.406225568349572e-05, "logits/chosen": 1.250440001487732, "logits/rejected": 1.2278730869293213, "logps/chosen": -214.63095092773438, "logps/rejected": -228.7228546142578, "loss": 0.4543, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2.260830879211426, "rewards/margins": 1.3212037086486816, "rewards/rejected": -3.5820343494415283, "step": 1360 }, { "epoch": 0.4044877472689696, "grad_norm": 12.979426383972168, "learning_rate": 1.4061978151756717e-05, "logits/chosen": 1.0435928106307983, "logits/rejected": 1.0178711414337158, "logps/chosen": -212.8032684326172, "logps/rejected": -227.16812133789062, "loss": 0.4955, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -2.2426681518554688, "rewards/margins": 1.1579865217208862, "rewards/rejected": -3.4006550312042236, "step": 1370 }, { "epoch": 0.40744021257750224, "grad_norm": 6.490755558013916, "learning_rate": 1.4061700620017715e-05, "logits/chosen": 0.8000830411911011, "logits/rejected": 0.7795848846435547, "logps/chosen": -208.5902557373047, "logps/rejected": -223.40097045898438, "loss": 0.5783, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -1.8605737686157227, "rewards/margins": 1.166029691696167, "rewards/rejected": -3.0266032218933105, "step": 1380 }, { "epoch": 0.41039267788603484, "grad_norm": 9.952801704406738, "learning_rate": 1.4061423088278714e-05, "logits/chosen": 0.9873785972595215, "logits/rejected": 0.9839811325073242, "logps/chosen": -217.22036743164062, "logps/rejected": -228.490478515625, "loss": 0.5222, "rewards/accuracies": 0.75, "rewards/chosen": -2.090999126434326, "rewards/margins": 1.1178557872772217, "rewards/rejected": -3.2088546752929688, "step": 1390 }, { "epoch": 0.41334514319456744, "grad_norm": 12.614864349365234, "learning_rate": 1.4061145556539711e-05, "logits/chosen": 1.0705889463424683, "logits/rejected": 1.0683691501617432, "logps/chosen": -214.81668090820312, "logps/rejected": -222.8192901611328, "loss": 0.5642, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -1.9317257404327393, "rewards/margins": 0.7571130394935608, "rewards/rejected": -2.688838481903076, "step": 1400 }, { "epoch": 0.4162976085031001, "grad_norm": 7.199221611022949, "learning_rate": 1.4060868024800708e-05, "logits/chosen": 1.1778976917266846, "logits/rejected": 1.1457722187042236, "logps/chosen": -208.38607788085938, "logps/rejected": -227.04757690429688, "loss": 0.417, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -1.9460535049438477, "rewards/margins": 1.3562421798706055, "rewards/rejected": -3.302295684814453, "step": 1410 }, { "epoch": 0.4192500738116327, "grad_norm": 13.090044975280762, "learning_rate": 1.4060590493061707e-05, "logits/chosen": 0.8216427564620972, "logits/rejected": 0.8223376274108887, "logps/chosen": -216.40902709960938, "logps/rejected": -225.3256072998047, "loss": 0.476, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.3372044563293457, "rewards/margins": 1.2104870080947876, "rewards/rejected": -3.5476908683776855, "step": 1420 }, { "epoch": 0.42220253912016537, "grad_norm": 10.719206809997559, "learning_rate": 1.4060312961322706e-05, "logits/chosen": 0.8986299633979797, "logits/rejected": 0.8667129278182983, "logps/chosen": -214.3195343017578, "logps/rejected": -229.7778778076172, "loss": 0.4738, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -2.460655689239502, "rewards/margins": 1.2484391927719116, "rewards/rejected": -3.709095001220703, "step": 1430 }, { "epoch": 0.42515500442869797, "grad_norm": 9.081042289733887, "learning_rate": 1.4060035429583703e-05, "logits/chosen": 1.1788232326507568, "logits/rejected": 1.1589388847351074, "logps/chosen": -219.0507354736328, "logps/rejected": -233.44772338867188, "loss": 0.4012, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.56376314163208, "rewards/margins": 1.1945992708206177, "rewards/rejected": -3.7583625316619873, "step": 1440 }, { "epoch": 0.4281074697372306, "grad_norm": 20.65030860900879, "learning_rate": 1.40597578978447e-05, "logits/chosen": 0.7724562883377075, "logits/rejected": 0.7638182640075684, "logps/chosen": -214.2189483642578, "logps/rejected": -231.114501953125, "loss": 0.5634, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -3.0367889404296875, "rewards/margins": 0.968031108379364, "rewards/rejected": -4.004820346832275, "step": 1450 }, { "epoch": 0.43105993504576323, "grad_norm": 7.0193047523498535, "learning_rate": 1.4059480366105699e-05, "logits/chosen": 1.0599548816680908, "logits/rejected": 1.0459741353988647, "logps/chosen": -211.6204376220703, "logps/rejected": -225.19815063476562, "loss": 0.3962, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -2.284773588180542, "rewards/margins": 1.3477522134780884, "rewards/rejected": -3.63252592086792, "step": 1460 }, { "epoch": 0.43401240035429584, "grad_norm": 7.337431907653809, "learning_rate": 1.4059202834366697e-05, "logits/chosen": 0.9562395215034485, "logits/rejected": 0.9478033781051636, "logps/chosen": -213.0074005126953, "logps/rejected": -230.5025634765625, "loss": 0.4603, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -2.2838051319122314, "rewards/margins": 1.278007984161377, "rewards/rejected": -3.5618128776550293, "step": 1470 }, { "epoch": 0.43696486566282844, "grad_norm": 11.480175971984863, "learning_rate": 1.4058925302627694e-05, "logits/chosen": 1.180159330368042, "logits/rejected": 1.1503002643585205, "logps/chosen": -216.0004119873047, "logps/rejected": -226.7790985107422, "loss": 0.5327, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.241337299346924, "rewards/margins": 1.2220370769500732, "rewards/rejected": -3.463374376296997, "step": 1480 }, { "epoch": 0.4399173309713611, "grad_norm": 10.498906135559082, "learning_rate": 1.4058647770888693e-05, "logits/chosen": 0.9739218950271606, "logits/rejected": 0.9440935254096985, "logps/chosen": -214.2965087890625, "logps/rejected": -231.44180297851562, "loss": 0.4614, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -2.266826868057251, "rewards/margins": 1.395552158355713, "rewards/rejected": -3.6623787879943848, "step": 1490 }, { "epoch": 0.4428697962798937, "grad_norm": 6.330338478088379, "learning_rate": 1.405837023914969e-05, "logits/chosen": 0.9716413617134094, "logits/rejected": 0.960383415222168, "logps/chosen": -220.914306640625, "logps/rejected": -231.07736206054688, "loss": 0.4452, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -2.8135244846343994, "rewards/margins": 1.2335678339004517, "rewards/rejected": -4.047092437744141, "step": 1500 }, { "epoch": 0.44582226158842636, "grad_norm": 8.765600204467773, "learning_rate": 1.4058092707410689e-05, "logits/chosen": 1.1714890003204346, "logits/rejected": 1.129003643989563, "logps/chosen": -226.14871215820312, "logps/rejected": -244.5494384765625, "loss": 0.3468, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -3.4796619415283203, "rewards/margins": 1.697302222251892, "rewards/rejected": -5.176963806152344, "step": 1510 }, { "epoch": 0.44877472689695896, "grad_norm": 4.537236213684082, "learning_rate": 1.4057815175671686e-05, "logits/chosen": 1.0341883897781372, "logits/rejected": 1.017451286315918, "logps/chosen": -231.5845947265625, "logps/rejected": -242.3070831298828, "loss": 0.483, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -3.893104076385498, "rewards/margins": 1.3605669736862183, "rewards/rejected": -5.253671169281006, "step": 1520 }, { "epoch": 0.45172719220549157, "grad_norm": 31.776721954345703, "learning_rate": 1.4057537643932685e-05, "logits/chosen": 0.940933883190155, "logits/rejected": 0.930675208568573, "logps/chosen": -221.7569122314453, "logps/rejected": -237.82095336914062, "loss": 0.5911, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.082986831665039, "rewards/margins": 1.356749415397644, "rewards/rejected": -4.439736366271973, "step": 1530 }, { "epoch": 0.4546796575140242, "grad_norm": 4.614811897277832, "learning_rate": 1.4057260112193682e-05, "logits/chosen": 1.1853039264678955, "logits/rejected": 1.1743733882904053, "logps/chosen": -215.8589324951172, "logps/rejected": -224.66635131835938, "loss": 0.482, "rewards/accuracies": 0.75, "rewards/chosen": -2.3534252643585205, "rewards/margins": 1.2282087802886963, "rewards/rejected": -3.581634044647217, "step": 1540 }, { "epoch": 0.45763212282255683, "grad_norm": 12.026450157165527, "learning_rate": 1.405698258045468e-05, "logits/chosen": 1.1335337162017822, "logits/rejected": 1.1130822896957397, "logps/chosen": -211.6738739013672, "logps/rejected": -223.2755584716797, "loss": 0.545, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -1.9970706701278687, "rewards/margins": 1.162044644355774, "rewards/rejected": -3.1591153144836426, "step": 1550 }, { "epoch": 0.46058458813108943, "grad_norm": 10.953782081604004, "learning_rate": 1.4056705048715678e-05, "logits/chosen": 1.2471749782562256, "logits/rejected": 1.235811710357666, "logps/chosen": -224.13699340820312, "logps/rejected": -240.31423950195312, "loss": 0.394, "rewards/accuracies": 0.8166667819023132, "rewards/chosen": -3.005976438522339, "rewards/margins": 1.6897964477539062, "rewards/rejected": -4.695773124694824, "step": 1560 }, { "epoch": 0.4635370534396221, "grad_norm": 6.3064703941345215, "learning_rate": 1.4056427516976677e-05, "logits/chosen": 0.8881756067276001, "logits/rejected": 0.8785011172294617, "logps/chosen": -224.7781982421875, "logps/rejected": -236.4688720703125, "loss": 0.455, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3.1409261226654053, "rewards/margins": 1.4734103679656982, "rewards/rejected": -4.6143364906311035, "step": 1570 }, { "epoch": 0.4664895187481547, "grad_norm": 10.286478042602539, "learning_rate": 1.4056149985237674e-05, "logits/chosen": 1.1110920906066895, "logits/rejected": 1.0957175493240356, "logps/chosen": -213.31747436523438, "logps/rejected": -230.9911346435547, "loss": 0.4091, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2.3877875804901123, "rewards/margins": 1.6544755697250366, "rewards/rejected": -4.042263031005859, "step": 1580 }, { "epoch": 0.46944198405668736, "grad_norm": 13.790908813476562, "learning_rate": 1.405587245349867e-05, "logits/chosen": 1.4784047603607178, "logits/rejected": 1.4531900882720947, "logps/chosen": -216.329833984375, "logps/rejected": -235.14633178710938, "loss": 0.4687, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -2.485625743865967, "rewards/margins": 1.421156883239746, "rewards/rejected": -3.906782865524292, "step": 1590 }, { "epoch": 0.47239444936521996, "grad_norm": 12.51675796508789, "learning_rate": 1.4055594921759671e-05, "logits/chosen": 0.6222361326217651, "logits/rejected": 0.6146097779273987, "logps/chosen": -222.75265502929688, "logps/rejected": -235.95382690429688, "loss": 0.4627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4988269805908203, "rewards/margins": 1.1688191890716553, "rewards/rejected": -4.667646408081055, "step": 1600 }, { "epoch": 0.47534691467375256, "grad_norm": 15.598734855651855, "learning_rate": 1.4055317390020668e-05, "logits/chosen": 1.1921558380126953, "logits/rejected": 1.1806530952453613, "logps/chosen": -226.09573364257812, "logps/rejected": -238.32589721679688, "loss": 0.6108, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3.4581730365753174, "rewards/margins": 1.052140474319458, "rewards/rejected": -4.510313034057617, "step": 1610 }, { "epoch": 0.4782993799822852, "grad_norm": 7.823555946350098, "learning_rate": 1.4055039858281665e-05, "logits/chosen": 1.2592370510101318, "logits/rejected": 1.2386689186096191, "logps/chosen": -224.85568237304688, "logps/rejected": -235.06787109375, "loss": 0.554, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -2.9340662956237793, "rewards/margins": 0.8756451606750488, "rewards/rejected": -3.8097119331359863, "step": 1620 }, { "epoch": 0.4812518452908178, "grad_norm": 9.409639358520508, "learning_rate": 1.4054762326542662e-05, "logits/chosen": 1.058607816696167, "logits/rejected": 1.0380381345748901, "logps/chosen": -225.41702270507812, "logps/rejected": -229.32992553710938, "loss": 0.509, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.7302823066711426, "rewards/margins": 1.0809227228164673, "rewards/rejected": -3.8112049102783203, "step": 1630 }, { "epoch": 0.4842043105993505, "grad_norm": 15.414485931396484, "learning_rate": 1.4054484794803663e-05, "logits/chosen": 1.1270025968551636, "logits/rejected": 1.107818365097046, "logps/chosen": -219.19644165039062, "logps/rejected": -231.5637664794922, "loss": 0.495, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -2.28462815284729, "rewards/margins": 1.3447675704956055, "rewards/rejected": -3.6293957233428955, "step": 1640 }, { "epoch": 0.4871567759078831, "grad_norm": 13.326423645019531, "learning_rate": 1.405420726306466e-05, "logits/chosen": 1.2939577102661133, "logits/rejected": 1.271247148513794, "logps/chosen": -220.9195098876953, "logps/rejected": -238.4815216064453, "loss": 0.5738, "rewards/accuracies": 0.6833332777023315, "rewards/chosen": -2.6364405155181885, "rewards/margins": 1.2787940502166748, "rewards/rejected": -3.9152348041534424, "step": 1650 }, { "epoch": 0.4901092412164157, "grad_norm": 7.157304763793945, "learning_rate": 1.4053929731325657e-05, "logits/chosen": 0.875474750995636, "logits/rejected": 0.8687236905097961, "logps/chosen": -222.882568359375, "logps/rejected": -230.3934326171875, "loss": 0.6424, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -2.929285764694214, "rewards/margins": 0.6762797236442566, "rewards/rejected": -3.6055655479431152, "step": 1660 }, { "epoch": 0.49306170652494835, "grad_norm": 9.220298767089844, "learning_rate": 1.4053652199586654e-05, "logits/chosen": 1.0329129695892334, "logits/rejected": 1.0010924339294434, "logps/chosen": -220.41250610351562, "logps/rejected": -231.2218475341797, "loss": 0.3956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6779420375823975, "rewards/margins": 1.6635186672210693, "rewards/rejected": -4.341461181640625, "step": 1670 }, { "epoch": 0.49601417183348095, "grad_norm": 18.97873306274414, "learning_rate": 1.4053374667847654e-05, "logits/chosen": 1.139864444732666, "logits/rejected": 1.1273224353790283, "logps/chosen": -219.0438995361328, "logps/rejected": -232.8481903076172, "loss": 0.3938, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -2.6163673400878906, "rewards/margins": 1.7004365921020508, "rewards/rejected": -4.316803932189941, "step": 1680 }, { "epoch": 0.49896663714201356, "grad_norm": 6.0155253410339355, "learning_rate": 1.4053097136108651e-05, "logits/chosen": 0.6616982221603394, "logits/rejected": 0.6427015066146851, "logps/chosen": -218.3431396484375, "logps/rejected": -237.59915161132812, "loss": 0.3585, "rewards/accuracies": 0.8833333849906921, "rewards/chosen": -2.958918809890747, "rewards/margins": 1.4769586324691772, "rewards/rejected": -4.435877799987793, "step": 1690 }, { "epoch": 0.5019191024505462, "grad_norm": 12.585886001586914, "learning_rate": 1.4052819604369648e-05, "logits/chosen": 0.780805230140686, "logits/rejected": 0.7557088136672974, "logps/chosen": -215.06808471679688, "logps/rejected": -233.7985382080078, "loss": 0.4397, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.778411865234375, "rewards/margins": 1.5442287921905518, "rewards/rejected": -4.322640419006348, "step": 1700 }, { "epoch": 0.5048715677590788, "grad_norm": 14.795926094055176, "learning_rate": 1.4052542072630647e-05, "logits/chosen": 0.9700084924697876, "logits/rejected": 0.9421585202217102, "logps/chosen": -227.14212036132812, "logps/rejected": -246.5618438720703, "loss": 0.4464, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -3.4109604358673096, "rewards/margins": 1.6426721811294556, "rewards/rejected": -5.0536322593688965, "step": 1710 }, { "epoch": 0.5078240330676115, "grad_norm": 19.535480499267578, "learning_rate": 1.4052264540891646e-05, "logits/chosen": 0.8320516347885132, "logits/rejected": 0.8339737057685852, "logps/chosen": -222.58642578125, "logps/rejected": -235.4070281982422, "loss": 0.4357, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9155209064483643, "rewards/margins": 1.4115285873413086, "rewards/rejected": -4.32705020904541, "step": 1720 }, { "epoch": 0.510776498376144, "grad_norm": 11.918891906738281, "learning_rate": 1.4051987009152643e-05, "logits/chosen": 1.1524958610534668, "logits/rejected": 1.1292999982833862, "logps/chosen": -223.93179321289062, "logps/rejected": -240.9456329345703, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3810195922851562, "rewards/margins": 1.5262153148651123, "rewards/rejected": -4.907235145568848, "step": 1730 }, { "epoch": 0.5137289636846767, "grad_norm": 10.797476768493652, "learning_rate": 1.405170947741364e-05, "logits/chosen": 0.9855393171310425, "logits/rejected": 0.9627097249031067, "logps/chosen": -223.97811889648438, "logps/rejected": -245.0565643310547, "loss": 0.432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4321422576904297, "rewards/margins": 1.8874403238296509, "rewards/rejected": -5.319582939147949, "step": 1740 }, { "epoch": 0.5166814289932093, "grad_norm": 11.586099624633789, "learning_rate": 1.4051431945674639e-05, "logits/chosen": 0.9865446090698242, "logits/rejected": 0.9654587507247925, "logps/chosen": -222.3688507080078, "logps/rejected": -238.16259765625, "loss": 0.5369, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3.3233065605163574, "rewards/margins": 1.5000522136688232, "rewards/rejected": -4.82335901260376, "step": 1750 }, { "epoch": 0.519633894301742, "grad_norm": 9.1005859375, "learning_rate": 1.4051154413935638e-05, "logits/chosen": 1.0487946271896362, "logits/rejected": 1.0538784265518188, "logps/chosen": -221.91824340820312, "logps/rejected": -236.03854370117188, "loss": 0.367, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.861931800842285, "rewards/margins": 1.6440073251724243, "rewards/rejected": -4.50593900680542, "step": 1760 }, { "epoch": 0.5225863596102746, "grad_norm": 8.64305591583252, "learning_rate": 1.4050876882196635e-05, "logits/chosen": 1.2841771841049194, "logits/rejected": 1.2701022624969482, "logps/chosen": -222.2417449951172, "logps/rejected": -238.08642578125, "loss": 0.4388, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3.253349781036377, "rewards/margins": 1.687265396118164, "rewards/rejected": -4.940614700317383, "step": 1770 }, { "epoch": 0.5255388249188072, "grad_norm": 11.3748197555542, "learning_rate": 1.4050599350457632e-05, "logits/chosen": 1.3864820003509521, "logits/rejected": 1.3646023273468018, "logps/chosen": -226.59414672851562, "logps/rejected": -242.2103729248047, "loss": 0.4851, "rewards/accuracies": 0.7499999403953552, "rewards/chosen": -3.520637035369873, "rewards/margins": 1.6380409002304077, "rewards/rejected": -5.15867805480957, "step": 1780 }, { "epoch": 0.5284912902273399, "grad_norm": 12.428833961486816, "learning_rate": 1.405032181871863e-05, "logits/chosen": 1.0576988458633423, "logits/rejected": 1.036672592163086, "logps/chosen": -226.65328979492188, "logps/rejected": -240.326171875, "loss": 0.5184, "rewards/accuracies": 0.75, "rewards/chosen": -3.6989104747772217, "rewards/margins": 1.3362603187561035, "rewards/rejected": -5.035171031951904, "step": 1790 }, { "epoch": 0.5314437555358724, "grad_norm": 18.68917465209961, "learning_rate": 1.405004428697963e-05, "logits/chosen": 1.3553822040557861, "logits/rejected": 1.3416991233825684, "logps/chosen": -230.91238403320312, "logps/rejected": -237.5041046142578, "loss": 0.6367, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.82855486869812, "rewards/margins": 0.9431383013725281, "rewards/rejected": -4.771693229675293, "step": 1800 }, { "epoch": 0.5343962208444051, "grad_norm": 14.173099517822266, "learning_rate": 1.4049766755240626e-05, "logits/chosen": 0.8417409658432007, "logits/rejected": 0.825282871723175, "logps/chosen": -230.6946563720703, "logps/rejected": -243.65335083007812, "loss": 0.5327, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -3.6966805458068848, "rewards/margins": 1.4561761617660522, "rewards/rejected": -5.15285587310791, "step": 1810 }, { "epoch": 0.5373486861529377, "grad_norm": 5.303887367248535, "learning_rate": 1.4049489223501625e-05, "logits/chosen": 1.1815904378890991, "logits/rejected": 1.172903299331665, "logps/chosen": -222.25460815429688, "logps/rejected": -238.8775177001953, "loss": 0.4318, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -3.416632890701294, "rewards/margins": 1.7367494106292725, "rewards/rejected": -5.153382301330566, "step": 1820 }, { "epoch": 0.5403011514614703, "grad_norm": 7.323235511779785, "learning_rate": 1.4049211691762622e-05, "logits/chosen": 0.9596773982048035, "logits/rejected": 0.9527937769889832, "logps/chosen": -235.58395385742188, "logps/rejected": -245.30361938476562, "loss": 0.4726, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -4.078901767730713, "rewards/margins": 1.566620111465454, "rewards/rejected": -5.645521640777588, "step": 1830 }, { "epoch": 0.543253616770003, "grad_norm": 8.42373275756836, "learning_rate": 1.4048934160023621e-05, "logits/chosen": 1.2046916484832764, "logits/rejected": 1.1866157054901123, "logps/chosen": -221.65414428710938, "logps/rejected": -237.18887329101562, "loss": 0.528, "rewards/accuracies": 0.75, "rewards/chosen": -2.9302878379821777, "rewards/margins": 1.3599488735198975, "rewards/rejected": -4.290236473083496, "step": 1840 }, { "epoch": 0.5462060820785356, "grad_norm": 7.534632205963135, "learning_rate": 1.4048656628284618e-05, "logits/chosen": 0.9987422227859497, "logits/rejected": 0.9712702631950378, "logps/chosen": -219.38113403320312, "logps/rejected": -234.68527221679688, "loss": 0.4456, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": -3.0905537605285645, "rewards/margins": 1.396375060081482, "rewards/rejected": -4.486929416656494, "step": 1850 }, { "epoch": 0.5491585473870682, "grad_norm": 19.100399017333984, "learning_rate": 1.4048379096545617e-05, "logits/chosen": 0.7490253448486328, "logits/rejected": 0.7318816184997559, "logps/chosen": -218.13143920898438, "logps/rejected": -230.18252563476562, "loss": 0.5136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.60115385055542, "rewards/margins": 1.44405198097229, "rewards/rejected": -4.045206069946289, "step": 1860 }, { "epoch": 0.5521110126956008, "grad_norm": 17.510116577148438, "learning_rate": 1.4048101564806614e-05, "logits/chosen": 1.1358048915863037, "logits/rejected": 1.1227457523345947, "logps/chosen": -214.5306854248047, "logps/rejected": -226.809326171875, "loss": 0.3838, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -2.4399778842926025, "rewards/margins": 1.6445289850234985, "rewards/rejected": -4.084506511688232, "step": 1870 }, { "epoch": 0.5550634780041335, "grad_norm": 8.590580940246582, "learning_rate": 1.4047824033067613e-05, "logits/chosen": 1.3331992626190186, "logits/rejected": 1.3173277378082275, "logps/chosen": -212.5037841796875, "logps/rejected": -227.76535034179688, "loss": 0.4446, "rewards/accuracies": 0.75, "rewards/chosen": -2.0049571990966797, "rewards/margins": 1.5194203853607178, "rewards/rejected": -3.5243773460388184, "step": 1880 }, { "epoch": 0.5580159433126661, "grad_norm": 21.717100143432617, "learning_rate": 1.404754650132861e-05, "logits/chosen": 1.1179397106170654, "logits/rejected": 1.116748332977295, "logps/chosen": -217.95260620117188, "logps/rejected": -235.6158905029297, "loss": 0.3737, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -2.149440288543701, "rewards/margins": 1.720142126083374, "rewards/rejected": -3.869582414627075, "step": 1890 }, { "epoch": 0.5609684086211987, "grad_norm": 12.452489852905273, "learning_rate": 1.4047268969589608e-05, "logits/chosen": 0.7907549142837524, "logits/rejected": 0.7644432187080383, "logps/chosen": -225.06369018554688, "logps/rejected": -243.53646850585938, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": -3.5789577960968018, "rewards/margins": 1.2515678405761719, "rewards/rejected": -4.8305253982543945, "step": 1900 }, { "epoch": 0.5639208739297313, "grad_norm": 5.262327194213867, "learning_rate": 1.4046991437850605e-05, "logits/chosen": 0.9868022203445435, "logits/rejected": 0.9618912935256958, "logps/chosen": -236.4932098388672, "logps/rejected": -249.9724578857422, "loss": 0.484, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -3.9735381603240967, "rewards/margins": 1.5948079824447632, "rewards/rejected": -5.568346977233887, "step": 1910 }, { "epoch": 0.566873339238264, "grad_norm": 26.467538833618164, "learning_rate": 1.4046713906111602e-05, "logits/chosen": 1.0373772382736206, "logits/rejected": 1.0199997425079346, "logps/chosen": -235.2398681640625, "logps/rejected": -252.9575653076172, "loss": 0.4799, "rewards/accuracies": 0.7500001192092896, "rewards/chosen": -4.180903434753418, "rewards/margins": 1.5688059329986572, "rewards/rejected": -5.749709606170654, "step": 1920 }, { "epoch": 0.5698258045467965, "grad_norm": 5.759504795074463, "learning_rate": 1.4046436374372603e-05, "logits/chosen": 0.9874226450920105, "logits/rejected": 0.9612785577774048, "logps/chosen": -227.50479125976562, "logps/rejected": -242.74118041992188, "loss": 0.3919, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3.8661491870880127, "rewards/margins": 1.6610467433929443, "rewards/rejected": -5.527195930480957, "step": 1930 }, { "epoch": 0.5727782698553292, "grad_norm": 16.558284759521484, "learning_rate": 1.40461588426336e-05, "logits/chosen": 1.0014379024505615, "logits/rejected": 0.9708874821662903, "logps/chosen": -231.52206420898438, "logps/rejected": -245.1530303955078, "loss": 0.4791, "rewards/accuracies": 0.7499999403953552, "rewards/chosen": -3.9946048259735107, "rewards/margins": 1.5324041843414307, "rewards/rejected": -5.527009010314941, "step": 1940 }, { "epoch": 0.5757307351638619, "grad_norm": 12.350872039794922, "learning_rate": 1.4045881310894597e-05, "logits/chosen": 1.1987286806106567, "logits/rejected": 1.195168375968933, "logps/chosen": -231.8637237548828, "logps/rejected": -241.6493682861328, "loss": 0.5725, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.8819591999053955, "rewards/margins": 1.0772539377212524, "rewards/rejected": -4.9592132568359375, "step": 1950 }, { "epoch": 0.5786832004723944, "grad_norm": 8.358963012695312, "learning_rate": 1.4045603779155594e-05, "logits/chosen": 1.677512764930725, "logits/rejected": 1.662668228149414, "logps/chosen": -224.2736358642578, "logps/rejected": -241.71981811523438, "loss": 0.3999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8663012981414795, "rewards/margins": 1.6196224689483643, "rewards/rejected": -4.485924243927002, "step": 1960 }, { "epoch": 0.5816356657809271, "grad_norm": 3.003647804260254, "learning_rate": 1.4045326247416595e-05, "logits/chosen": 0.9913145899772644, "logits/rejected": 0.9819844961166382, "logps/chosen": -222.6855010986328, "logps/rejected": -243.7069549560547, "loss": 0.4263, "rewards/accuracies": 0.8166667819023132, "rewards/chosen": -2.7311346530914307, "rewards/margins": 1.9396950006484985, "rewards/rejected": -4.6708292961120605, "step": 1970 }, { "epoch": 0.5845881310894597, "grad_norm": 19.90190315246582, "learning_rate": 1.4045048715677592e-05, "logits/chosen": 1.0625743865966797, "logits/rejected": 1.041251301765442, "logps/chosen": -226.51095581054688, "logps/rejected": -242.73593139648438, "loss": 0.5825, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3.198983669281006, "rewards/margins": 1.2009245157241821, "rewards/rejected": -4.399908065795898, "step": 1980 }, { "epoch": 0.5875405963979923, "grad_norm": 6.051835060119629, "learning_rate": 1.4044771183938589e-05, "logits/chosen": 1.3144904375076294, "logits/rejected": 1.279841661453247, "logps/chosen": -225.904052734375, "logps/rejected": -242.86026000976562, "loss": 0.3898, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3.461348295211792, "rewards/margins": 1.6520971059799194, "rewards/rejected": -5.11344575881958, "step": 1990 }, { "epoch": 0.5904930617065249, "grad_norm": 10.591939926147461, "learning_rate": 1.4044493652199586e-05, "logits/chosen": 0.8760279417037964, "logits/rejected": 0.8518228530883789, "logps/chosen": -228.35293579101562, "logps/rejected": -237.56088256835938, "loss": 0.5904, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -3.9549038410186768, "rewards/margins": 1.0476272106170654, "rewards/rejected": -5.0025315284729, "step": 2000 }, { "epoch": 0.5934455270150576, "grad_norm": 27.022233963012695, "learning_rate": 1.4044216120460586e-05, "logits/chosen": 1.1633622646331787, "logits/rejected": 1.1502275466918945, "logps/chosen": -228.3351287841797, "logps/rejected": -244.27294921875, "loss": 0.3889, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -3.4606094360351562, "rewards/margins": 1.7647895812988281, "rewards/rejected": -5.225399017333984, "step": 2010 }, { "epoch": 0.5963979923235901, "grad_norm": 9.245481491088867, "learning_rate": 1.4043938588721583e-05, "logits/chosen": 0.872935950756073, "logits/rejected": 0.8526153564453125, "logps/chosen": -236.4927520751953, "logps/rejected": -249.2232666015625, "loss": 0.5111, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -4.318135738372803, "rewards/margins": 1.2097842693328857, "rewards/rejected": -5.527919769287109, "step": 2020 }, { "epoch": 0.5993504576321228, "grad_norm": 15.353803634643555, "learning_rate": 1.404366105698258e-05, "logits/chosen": 0.9331648945808411, "logits/rejected": 0.9285160303115845, "logps/chosen": -230.70065307617188, "logps/rejected": -250.22470092773438, "loss": 0.3648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.2150983810424805, "rewards/margins": 1.4726877212524414, "rewards/rejected": -5.687786102294922, "step": 2030 }, { "epoch": 0.6023029229406555, "grad_norm": 9.062928199768066, "learning_rate": 1.4043383525243579e-05, "logits/chosen": 1.2700622081756592, "logits/rejected": 1.2407115697860718, "logps/chosen": -230.49008178710938, "logps/rejected": -240.206298828125, "loss": 0.4702, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -3.629777431488037, "rewards/margins": 1.3207467794418335, "rewards/rejected": -4.950523853302002, "step": 2040 }, { "epoch": 0.6052553882491881, "grad_norm": 9.731832504272461, "learning_rate": 1.4043105993504578e-05, "logits/chosen": 1.1635135412216187, "logits/rejected": 1.1410480737686157, "logps/chosen": -233.210205078125, "logps/rejected": -244.0474090576172, "loss": 0.4228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6873791217803955, "rewards/margins": 1.4474737644195557, "rewards/rejected": -5.134853363037109, "step": 2050 }, { "epoch": 0.6082078535577207, "grad_norm": 10.926109313964844, "learning_rate": 1.4042828461765575e-05, "logits/chosen": 1.0731263160705566, "logits/rejected": 1.0537775754928589, "logps/chosen": -223.7839813232422, "logps/rejected": -240.07693481445312, "loss": 0.4807, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.0771522521972656, "rewards/margins": 1.5060920715332031, "rewards/rejected": -4.583244323730469, "step": 2060 }, { "epoch": 0.6111603188662533, "grad_norm": 12.83571720123291, "learning_rate": 1.4042550930026572e-05, "logits/chosen": 0.8637874722480774, "logits/rejected": 0.8471053242683411, "logps/chosen": -227.87802124023438, "logps/rejected": -246.36019897460938, "loss": 0.5352, "rewards/accuracies": 0.75, "rewards/chosen": -3.6862621307373047, "rewards/margins": 1.4671632051467896, "rewards/rejected": -5.153425216674805, "step": 2070 }, { "epoch": 0.614112784174786, "grad_norm": 10.36312198638916, "learning_rate": 1.404227339828757e-05, "logits/chosen": 1.0129345655441284, "logits/rejected": 0.9849379658699036, "logps/chosen": -234.10733032226562, "logps/rejected": -244.87857055664062, "loss": 0.4853, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -3.9878640174865723, "rewards/margins": 1.3107620477676392, "rewards/rejected": -5.298625946044922, "step": 2080 }, { "epoch": 0.6170652494833185, "grad_norm": 6.613684177398682, "learning_rate": 1.404199586654857e-05, "logits/chosen": 0.8815206289291382, "logits/rejected": 0.8608744740486145, "logps/chosen": -231.21084594726562, "logps/rejected": -254.2685546875, "loss": 0.4578, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -4.37717342376709, "rewards/margins": 1.6401240825653076, "rewards/rejected": -6.017297267913818, "step": 2090 }, { "epoch": 0.6200177147918512, "grad_norm": 9.825085639953613, "learning_rate": 1.4041718334809567e-05, "logits/chosen": 0.820507824420929, "logits/rejected": 0.8040598034858704, "logps/chosen": -242.6494140625, "logps/rejected": -261.32440185546875, "loss": 0.4402, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -5.117182731628418, "rewards/margins": 1.6026933193206787, "rewards/rejected": -6.719876289367676, "step": 2100 }, { "epoch": 0.6229701801003839, "grad_norm": 14.793448448181152, "learning_rate": 1.4041440803070564e-05, "logits/chosen": 0.974295973777771, "logits/rejected": 0.9561226963996887, "logps/chosen": -246.47152709960938, "logps/rejected": -259.78192138671875, "loss": 0.5239, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -4.969615936279297, "rewards/margins": 1.4342807531356812, "rewards/rejected": -6.403896331787109, "step": 2110 }, { "epoch": 0.6259226454089164, "grad_norm": 9.398240089416504, "learning_rate": 1.4041163271331562e-05, "logits/chosen": 1.1968295574188232, "logits/rejected": 1.1665948629379272, "logps/chosen": -234.8848876953125, "logps/rejected": -254.34228515625, "loss": 0.2659, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -4.103208065032959, "rewards/margins": 2.2951102256774902, "rewards/rejected": -6.398318290710449, "step": 2120 }, { "epoch": 0.6288751107174491, "grad_norm": 12.459007263183594, "learning_rate": 1.4040885739592561e-05, "logits/chosen": 1.0309189558029175, "logits/rejected": 1.0288994312286377, "logps/chosen": -233.79837036132812, "logps/rejected": -248.22622680664062, "loss": 0.502, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -4.376748561859131, "rewards/margins": 1.4828442335128784, "rewards/rejected": -5.859592437744141, "step": 2130 }, { "epoch": 0.6318275760259817, "grad_norm": 6.858120441436768, "learning_rate": 1.4040608207853558e-05, "logits/chosen": 1.1295182704925537, "logits/rejected": 1.1081479787826538, "logps/chosen": -234.6557159423828, "logps/rejected": -248.0467071533203, "loss": 0.3923, "rewards/accuracies": 0.8500000834465027, "rewards/chosen": -4.078256130218506, "rewards/margins": 1.8706896305084229, "rewards/rejected": -5.94894552230835, "step": 2140 }, { "epoch": 0.6347800413345143, "grad_norm": 17.4219970703125, "learning_rate": 1.4040330676114557e-05, "logits/chosen": 0.9971069097518921, "logits/rejected": 0.9885370135307312, "logps/chosen": -227.97012329101562, "logps/rejected": -248.10964965820312, "loss": 0.5891, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -4.128756999969482, "rewards/margins": 1.1541001796722412, "rewards/rejected": -5.282857418060303, "step": 2150 }, { "epoch": 0.6377325066430469, "grad_norm": 12.234622955322266, "learning_rate": 1.4040053144375554e-05, "logits/chosen": 0.8278120756149292, "logits/rejected": 0.7895926833152771, "logps/chosen": -228.8499298095703, "logps/rejected": -252.95556640625, "loss": 0.4553, "rewards/accuracies": 0.8166666030883789, "rewards/chosen": -3.949889659881592, "rewards/margins": 1.6907211542129517, "rewards/rejected": -5.640610694885254, "step": 2160 }, { "epoch": 0.6406849719515796, "grad_norm": 17.03032112121582, "learning_rate": 1.4039775612636553e-05, "logits/chosen": 0.8894211053848267, "logits/rejected": 0.8766523599624634, "logps/chosen": -244.3411407470703, "logps/rejected": -259.0534973144531, "loss": 0.4464, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -5.021935939788818, "rewards/margins": 1.6013896465301514, "rewards/rejected": -6.623326301574707, "step": 2170 }, { "epoch": 0.6436374372601122, "grad_norm": 20.72562599182129, "learning_rate": 1.403949808089755e-05, "logits/chosen": 1.0182175636291504, "logits/rejected": 1.011072039604187, "logps/chosen": -233.48745727539062, "logps/rejected": -248.37051391601562, "loss": 0.5878, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -4.844840049743652, "rewards/margins": 1.1901451349258423, "rewards/rejected": -6.034984588623047, "step": 2180 }, { "epoch": 0.6465899025686448, "grad_norm": 15.929892539978027, "learning_rate": 1.4039220549158549e-05, "logits/chosen": 1.202858805656433, "logits/rejected": 1.1768066883087158, "logps/chosen": -241.51107788085938, "logps/rejected": -255.667236328125, "loss": 0.5152, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4.670727729797363, "rewards/margins": 1.4909489154815674, "rewards/rejected": -6.161676406860352, "step": 2190 }, { "epoch": 0.6495423678771775, "grad_norm": 11.151036262512207, "learning_rate": 1.4038943017419546e-05, "logits/chosen": 1.0777482986450195, "logits/rejected": 1.0513756275177002, "logps/chosen": -230.17001342773438, "logps/rejected": -247.788818359375, "loss": 0.5603, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4.220430850982666, "rewards/margins": 1.5376970767974854, "rewards/rejected": -5.758127689361572, "step": 2200 }, { "epoch": 0.6524948331857101, "grad_norm": 13.395500183105469, "learning_rate": 1.4038665485680543e-05, "logits/chosen": 0.8881146311759949, "logits/rejected": 0.8726558685302734, "logps/chosen": -226.69577026367188, "logps/rejected": -241.9739990234375, "loss": 0.5104, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4.047578811645508, "rewards/margins": 1.2345280647277832, "rewards/rejected": -5.282106876373291, "step": 2210 }, { "epoch": 0.6554472984942427, "grad_norm": 20.05646514892578, "learning_rate": 1.4038387953941541e-05, "logits/chosen": 0.6570107340812683, "logits/rejected": 0.6378307342529297, "logps/chosen": -241.9140167236328, "logps/rejected": -249.8760223388672, "loss": 0.4159, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -4.898801326751709, "rewards/margins": 1.3823115825653076, "rewards/rejected": -6.2811126708984375, "step": 2220 }, { "epoch": 0.6583997638027753, "grad_norm": 12.276981353759766, "learning_rate": 1.403811042220254e-05, "logits/chosen": 0.9142853617668152, "logits/rejected": 0.8974307179450989, "logps/chosen": -248.7666015625, "logps/rejected": -259.70947265625, "loss": 0.5373, "rewards/accuracies": 0.75, "rewards/chosen": -5.609926223754883, "rewards/margins": 1.2506811618804932, "rewards/rejected": -6.860607147216797, "step": 2230 }, { "epoch": 0.661352229111308, "grad_norm": 17.80704689025879, "learning_rate": 1.4037832890463537e-05, "logits/chosen": 0.9607647061347961, "logits/rejected": 0.9193191528320312, "logps/chosen": -242.05087280273438, "logps/rejected": -265.2187194824219, "loss": 0.4095, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -5.107037544250488, "rewards/margins": 1.688391923904419, "rewards/rejected": -6.7954301834106445, "step": 2240 }, { "epoch": 0.6643046944198405, "grad_norm": 3.7486793994903564, "learning_rate": 1.4037555358724534e-05, "logits/chosen": 0.7987117171287537, "logits/rejected": 0.7713675498962402, "logps/chosen": -240.5265350341797, "logps/rejected": -256.04736328125, "loss": 0.4367, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -4.784816741943359, "rewards/margins": 1.469229817390442, "rewards/rejected": -6.254046440124512, "step": 2250 }, { "epoch": 0.6672571597283732, "grad_norm": 10.359475135803223, "learning_rate": 1.4037277826985533e-05, "logits/chosen": 1.138303279876709, "logits/rejected": 1.1249371767044067, "logps/chosen": -236.57675170898438, "logps/rejected": -253.843994140625, "loss": 0.3382, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -4.3568010330200195, "rewards/margins": 2.18798828125, "rewards/rejected": -6.5447893142700195, "step": 2260 }, { "epoch": 0.6702096250369058, "grad_norm": 9.39584732055664, "learning_rate": 1.4037000295246532e-05, "logits/chosen": 1.1608169078826904, "logits/rejected": 1.1355772018432617, "logps/chosen": -244.9860076904297, "logps/rejected": -261.56878662109375, "loss": 0.3309, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -4.910251140594482, "rewards/margins": 2.0631957054138184, "rewards/rejected": -6.973446846008301, "step": 2270 }, { "epoch": 0.6731620903454384, "grad_norm": 3.884392738342285, "learning_rate": 1.4036722763507529e-05, "logits/chosen": 0.9657142758369446, "logits/rejected": 0.9516152143478394, "logps/chosen": -245.64956665039062, "logps/rejected": -252.88302612304688, "loss": 0.5674, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -4.9712324142456055, "rewards/margins": 1.384265422821045, "rewards/rejected": -6.355496883392334, "step": 2280 }, { "epoch": 0.676114555653971, "grad_norm": 10.185685157775879, "learning_rate": 1.4036445231768526e-05, "logits/chosen": 0.8114938735961914, "logits/rejected": 0.7803478240966797, "logps/chosen": -237.25473022460938, "logps/rejected": -257.38665771484375, "loss": 0.3119, "rewards/accuracies": 0.8666667938232422, "rewards/chosen": -4.3121843338012695, "rewards/margins": 2.25675630569458, "rewards/rejected": -6.56894063949585, "step": 2290 }, { "epoch": 0.6790670209625037, "grad_norm": 10.484964370727539, "learning_rate": 1.4036167700029526e-05, "logits/chosen": 1.2228447198867798, "logits/rejected": 1.2026489973068237, "logps/chosen": -243.5087432861328, "logps/rejected": -255.73046875, "loss": 0.5278, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -4.950496673583984, "rewards/margins": 1.6659832000732422, "rewards/rejected": -6.616479396820068, "step": 2300 }, { "epoch": 0.6820194862710364, "grad_norm": 8.377910614013672, "learning_rate": 1.4035890168290523e-05, "logits/chosen": 1.3315393924713135, "logits/rejected": 1.3067361116409302, "logps/chosen": -236.59765625, "logps/rejected": -260.25341796875, "loss": 0.4002, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": -4.409584999084473, "rewards/margins": 1.6209160089492798, "rewards/rejected": -6.030501365661621, "step": 2310 }, { "epoch": 0.6849719515795689, "grad_norm": 5.50555944442749, "learning_rate": 1.403561263655152e-05, "logits/chosen": 1.3287485837936401, "logits/rejected": 1.3010895252227783, "logps/chosen": -238.667236328125, "logps/rejected": -250.6184539794922, "loss": 0.5102, "rewards/accuracies": 0.75, "rewards/chosen": -4.645434379577637, "rewards/margins": 1.3829964399337769, "rewards/rejected": -6.028430938720703, "step": 2320 }, { "epoch": 0.6879244168881016, "grad_norm": 16.55809211730957, "learning_rate": 1.4035335104812518e-05, "logits/chosen": 0.8255594372749329, "logits/rejected": 0.7837125062942505, "logps/chosen": -241.9386444091797, "logps/rejected": -258.86724853515625, "loss": 0.4782, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": -4.959227561950684, "rewards/margins": 1.6129531860351562, "rewards/rejected": -6.572180271148682, "step": 2330 }, { "epoch": 0.6908768821966342, "grad_norm": 4.937407493591309, "learning_rate": 1.4035057573073518e-05, "logits/chosen": 0.9412003755569458, "logits/rejected": 0.9088653326034546, "logps/chosen": -242.14834594726562, "logps/rejected": -260.61187744140625, "loss": 0.3496, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -4.739530086517334, "rewards/margins": 1.7287575006484985, "rewards/rejected": -6.468287467956543, "step": 2340 }, { "epoch": 0.6938293475051668, "grad_norm": 11.034295082092285, "learning_rate": 1.4034780041334515e-05, "logits/chosen": 1.2386988401412964, "logits/rejected": 1.2332566976547241, "logps/chosen": -239.6439971923828, "logps/rejected": -250.81063842773438, "loss": 0.6342, "rewards/accuracies": 0.7166667580604553, "rewards/chosen": -4.55072021484375, "rewards/margins": 1.103173017501831, "rewards/rejected": -5.653892993927002, "step": 2350 }, { "epoch": 0.6967818128136994, "grad_norm": 25.864639282226562, "learning_rate": 1.4034502509595512e-05, "logits/chosen": 1.0369950532913208, "logits/rejected": 1.0104267597198486, "logps/chosen": -234.9955291748047, "logps/rejected": -250.1712188720703, "loss": 0.6448, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -3.991518497467041, "rewards/margins": 1.3746110200881958, "rewards/rejected": -5.366128921508789, "step": 2360 }, { "epoch": 0.6997342781222321, "grad_norm": 7.241127014160156, "learning_rate": 1.4034224977856511e-05, "logits/chosen": 1.3406105041503906, "logits/rejected": 1.3297383785247803, "logps/chosen": -224.3379669189453, "logps/rejected": -238.83035278320312, "loss": 0.524, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.185976505279541, "rewards/margins": 1.4703572988510132, "rewards/rejected": -4.6563334465026855, "step": 2370 }, { "epoch": 0.7026867434307646, "grad_norm": 7.683022975921631, "learning_rate": 1.403394744611751e-05, "logits/chosen": 1.1014691591262817, "logits/rejected": 1.0813627243041992, "logps/chosen": -218.69140625, "logps/rejected": -236.03500366210938, "loss": 0.4076, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -2.78704571723938, "rewards/margins": 1.4113819599151611, "rewards/rejected": -4.198427200317383, "step": 2380 }, { "epoch": 0.7056392087392973, "grad_norm": 17.891355514526367, "learning_rate": 1.4033669914378507e-05, "logits/chosen": 0.9508160352706909, "logits/rejected": 0.9224417805671692, "logps/chosen": -227.2688751220703, "logps/rejected": -246.26443481445312, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": -3.511204242706299, "rewards/margins": 1.7264608144760132, "rewards/rejected": -5.237664699554443, "step": 2390 }, { "epoch": 0.70859167404783, "grad_norm": 4.373746871948242, "learning_rate": 1.4033392382639504e-05, "logits/chosen": 1.1645891666412354, "logits/rejected": 1.136260747909546, "logps/chosen": -234.36471557617188, "logps/rejected": -250.82449340820312, "loss": 0.5433, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -4.435126781463623, "rewards/margins": 1.518613576889038, "rewards/rejected": -5.953740119934082, "step": 2400 }, { "epoch": 0.7115441393563625, "grad_norm": 5.791463851928711, "learning_rate": 1.4033114850900503e-05, "logits/chosen": 0.7815160751342773, "logits/rejected": 0.7622783780097961, "logps/chosen": -242.0283203125, "logps/rejected": -256.458984375, "loss": 0.4417, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -4.850489139556885, "rewards/margins": 1.6950098276138306, "rewards/rejected": -6.545498847961426, "step": 2410 }, { "epoch": 0.7144966046648952, "grad_norm": 15.033862113952637, "learning_rate": 1.4032837319161501e-05, "logits/chosen": 0.9615638852119446, "logits/rejected": 0.9463192224502563, "logps/chosen": -236.8530731201172, "logps/rejected": -245.20242309570312, "loss": 0.4772, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -4.114894866943359, "rewards/margins": 1.3743619918823242, "rewards/rejected": -5.489256858825684, "step": 2420 }, { "epoch": 0.7174490699734278, "grad_norm": 12.056793212890625, "learning_rate": 1.4032559787422498e-05, "logits/chosen": 0.9110945463180542, "logits/rejected": 0.8867303133010864, "logps/chosen": -231.6307373046875, "logps/rejected": -250.95510864257812, "loss": 0.3949, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -3.786421537399292, "rewards/margins": 1.8261181116104126, "rewards/rejected": -5.612539768218994, "step": 2430 }, { "epoch": 0.7204015352819605, "grad_norm": 11.408912658691406, "learning_rate": 1.4032282255683495e-05, "logits/chosen": 1.0933301448822021, "logits/rejected": 1.070220708847046, "logps/chosen": -235.52560424804688, "logps/rejected": -253.555908203125, "loss": 0.3981, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -4.215879917144775, "rewards/margins": 1.734018325805664, "rewards/rejected": -5.949898719787598, "step": 2440 }, { "epoch": 0.723354000590493, "grad_norm": 6.645534038543701, "learning_rate": 1.4032004723944494e-05, "logits/chosen": 0.974618136882782, "logits/rejected": 0.9645155668258667, "logps/chosen": -226.51382446289062, "logps/rejected": -248.6309051513672, "loss": 0.4007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.491992950439453, "rewards/margins": 1.7234134674072266, "rewards/rejected": -5.2154059410095215, "step": 2450 }, { "epoch": 0.7263064658990257, "grad_norm": 12.981518745422363, "learning_rate": 1.4031727192205493e-05, "logits/chosen": 1.0116431713104248, "logits/rejected": 0.9918069839477539, "logps/chosen": -225.6684112548828, "logps/rejected": -244.9346923828125, "loss": 0.4595, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -3.572920322418213, "rewards/margins": 1.903009057044983, "rewards/rejected": -5.475928783416748, "step": 2460 }, { "epoch": 0.7292589312075584, "grad_norm": 9.995537757873535, "learning_rate": 1.403144966046649e-05, "logits/chosen": 1.435523509979248, "logits/rejected": 1.4031906127929688, "logps/chosen": -222.85739135742188, "logps/rejected": -235.7170867919922, "loss": 0.4549, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -3.208177089691162, "rewards/margins": 1.337496042251587, "rewards/rejected": -4.545673370361328, "step": 2470 }, { "epoch": 0.7322113965160909, "grad_norm": 12.582255363464355, "learning_rate": 1.4031172128727487e-05, "logits/chosen": 0.975744903087616, "logits/rejected": 0.9560412168502808, "logps/chosen": -219.05313110351562, "logps/rejected": -236.1013641357422, "loss": 0.3689, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.412224531173706, "rewards/margins": 1.6467090845108032, "rewards/rejected": -4.058933734893799, "step": 2480 }, { "epoch": 0.7351638618246236, "grad_norm": 25.51331329345703, "learning_rate": 1.4030894596988486e-05, "logits/chosen": 0.9571698904037476, "logits/rejected": 0.9359889030456543, "logps/chosen": -219.0782012939453, "logps/rejected": -234.6865234375, "loss": 0.5225, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -2.903264045715332, "rewards/margins": 1.2031621932983398, "rewards/rejected": -4.106425762176514, "step": 2490 }, { "epoch": 0.7381163271331562, "grad_norm": 12.200304985046387, "learning_rate": 1.4030617065249485e-05, "logits/chosen": 1.157606840133667, "logits/rejected": 1.1488487720489502, "logps/chosen": -225.23046875, "logps/rejected": -242.68603515625, "loss": 0.3447, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -3.000967025756836, "rewards/margins": 1.6129741668701172, "rewards/rejected": -4.613941192626953, "step": 2500 }, { "epoch": 0.7410687924416888, "grad_norm": 9.763965606689453, "learning_rate": 1.4030339533510482e-05, "logits/chosen": 0.9794250726699829, "logits/rejected": 0.9458015561103821, "logps/chosen": -231.7982177734375, "logps/rejected": -238.81948852539062, "loss": 0.5425, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -4.175982475280762, "rewards/margins": 1.2466645240783691, "rewards/rejected": -5.422646999359131, "step": 2510 }, { "epoch": 0.7440212577502214, "grad_norm": 13.859847068786621, "learning_rate": 1.403006200177148e-05, "logits/chosen": 1.012086272239685, "logits/rejected": 1.0024627447128296, "logps/chosen": -249.3809051513672, "logps/rejected": -264.6347961425781, "loss": 0.3411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.478275299072266, "rewards/margins": 1.6110693216323853, "rewards/rejected": -7.0893449783325195, "step": 2520 }, { "epoch": 0.7469737230587541, "grad_norm": 13.950196266174316, "learning_rate": 1.4029784470032477e-05, "logits/chosen": 0.872369110584259, "logits/rejected": 0.861441969871521, "logps/chosen": -253.2641143798828, "logps/rejected": -261.0155944824219, "loss": 0.401, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -5.971985816955566, "rewards/margins": 1.4231771230697632, "rewards/rejected": -7.395163059234619, "step": 2530 }, { "epoch": 0.7499261883672866, "grad_norm": 13.11074161529541, "learning_rate": 1.4029506938293475e-05, "logits/chosen": 1.0967719554901123, "logits/rejected": 1.086946725845337, "logps/chosen": -254.156982421875, "logps/rejected": -271.4275207519531, "loss": 0.4818, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -5.886039733886719, "rewards/margins": 1.4467670917510986, "rewards/rejected": -7.3328070640563965, "step": 2540 }, { "epoch": 0.7528786536758193, "grad_norm": 12.299427032470703, "learning_rate": 1.4029229406554473e-05, "logits/chosen": 1.1394317150115967, "logits/rejected": 1.131117343902588, "logps/chosen": -244.83602905273438, "logps/rejected": -261.69610595703125, "loss": 0.4325, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -5.064723968505859, "rewards/margins": 1.6214224100112915, "rewards/rejected": -6.6861467361450195, "step": 2550 }, { "epoch": 0.755831118984352, "grad_norm": 26.170480728149414, "learning_rate": 1.4028951874815472e-05, "logits/chosen": 1.1286168098449707, "logits/rejected": 1.1231980323791504, "logps/chosen": -232.57666015625, "logps/rejected": -249.7123260498047, "loss": 0.3557, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.25711727142334, "rewards/margins": 2.0579447746276855, "rewards/rejected": -6.315062522888184, "step": 2560 }, { "epoch": 0.7587835842928845, "grad_norm": 8.043501853942871, "learning_rate": 1.4028674343076469e-05, "logits/chosen": 0.7695469260215759, "logits/rejected": 0.7497983574867249, "logps/chosen": -237.1902618408203, "logps/rejected": -256.79168701171875, "loss": 0.3995, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.340848445892334, "rewards/margins": 2.0121982097625732, "rewards/rejected": -6.3530473709106445, "step": 2570 }, { "epoch": 0.7617360496014172, "grad_norm": 13.420774459838867, "learning_rate": 1.4028396811337466e-05, "logits/chosen": 0.9394356608390808, "logits/rejected": 0.9137392044067383, "logps/chosen": -239.7417449951172, "logps/rejected": -268.3270263671875, "loss": 0.3398, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4.475794792175293, "rewards/margins": 2.4867758750915527, "rewards/rejected": -6.9625701904296875, "step": 2580 }, { "epoch": 0.7646885149099498, "grad_norm": 5.305803298950195, "learning_rate": 1.4028119279598465e-05, "logits/chosen": 1.064272165298462, "logits/rejected": 1.0432249307632446, "logps/chosen": -240.0825653076172, "logps/rejected": -259.0626220703125, "loss": 0.4594, "rewards/accuracies": 0.75, "rewards/chosen": -4.515328884124756, "rewards/margins": 1.9983574151992798, "rewards/rejected": -6.5136871337890625, "step": 2590 }, { "epoch": 0.7676409802184825, "grad_norm": 18.528532028198242, "learning_rate": 1.4027841747859464e-05, "logits/chosen": 1.2895772457122803, "logits/rejected": 1.286705732345581, "logps/chosen": -239.95468139648438, "logps/rejected": -259.3045654296875, "loss": 0.3966, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.733645439147949, "rewards/margins": 2.145394802093506, "rewards/rejected": -6.879039764404297, "step": 2600 }, { "epoch": 0.770593445527015, "grad_norm": 3.6851065158843994, "learning_rate": 1.402756421612046e-05, "logits/chosen": 1.0075557231903076, "logits/rejected": 0.9830906987190247, "logps/chosen": -254.1927490234375, "logps/rejected": -276.2006530761719, "loss": 0.41, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -6.045876979827881, "rewards/margins": 1.9274005889892578, "rewards/rejected": -7.9732770919799805, "step": 2610 }, { "epoch": 0.7735459108355477, "grad_norm": 6.970127582550049, "learning_rate": 1.4027286684381458e-05, "logits/chosen": 0.9266250729560852, "logits/rejected": 0.902758777141571, "logps/chosen": -254.53280639648438, "logps/rejected": -268.7155456542969, "loss": 0.5154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.1708455085754395, "rewards/margins": 1.680927038192749, "rewards/rejected": -7.851772308349609, "step": 2620 }, { "epoch": 0.7764983761440803, "grad_norm": 11.729626655578613, "learning_rate": 1.4027009152642458e-05, "logits/chosen": 0.8939005136489868, "logits/rejected": 0.8732063174247742, "logps/chosen": -239.05538940429688, "logps/rejected": -266.0872497558594, "loss": 0.3814, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -4.949268341064453, "rewards/margins": 2.0623912811279297, "rewards/rejected": -7.011659145355225, "step": 2630 }, { "epoch": 0.7794508414526129, "grad_norm": 11.137190818786621, "learning_rate": 1.4026731620903455e-05, "logits/chosen": 1.1265472173690796, "logits/rejected": 1.0975806713104248, "logps/chosen": -236.5463409423828, "logps/rejected": -260.35028076171875, "loss": 0.4387, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.345498561859131, "rewards/margins": 2.2427375316619873, "rewards/rejected": -6.588236331939697, "step": 2640 }, { "epoch": 0.7824033067611456, "grad_norm": 9.975071907043457, "learning_rate": 1.4026454089164452e-05, "logits/chosen": 1.1152597665786743, "logits/rejected": 1.093147873878479, "logps/chosen": -237.9548797607422, "logps/rejected": -264.3440246582031, "loss": 0.3006, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4.353127479553223, "rewards/margins": 2.7241408824920654, "rewards/rejected": -7.077267646789551, "step": 2650 }, { "epoch": 0.7853557720696782, "grad_norm": 3.8370718955993652, "learning_rate": 1.402617655742545e-05, "logits/chosen": 0.7662557363510132, "logits/rejected": 0.7169517278671265, "logps/chosen": -237.4637451171875, "logps/rejected": -263.3134765625, "loss": 0.3008, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4.760621070861816, "rewards/margins": 2.4962847232818604, "rewards/rejected": -7.256906032562256, "step": 2660 }, { "epoch": 0.7883082373782108, "grad_norm": 5.330772876739502, "learning_rate": 1.402589902568645e-05, "logits/chosen": 0.9617869257926941, "logits/rejected": 0.946183979511261, "logps/chosen": -243.9727783203125, "logps/rejected": -266.723876953125, "loss": 0.4267, "rewards/accuracies": 0.8166667819023132, "rewards/chosen": -5.049371719360352, "rewards/margins": 1.7264235019683838, "rewards/rejected": -6.77579402923584, "step": 2670 }, { "epoch": 0.7912607026867434, "grad_norm": 7.223052978515625, "learning_rate": 1.4025621493947447e-05, "logits/chosen": 1.27965247631073, "logits/rejected": 1.2545912265777588, "logps/chosen": -237.6911163330078, "logps/rejected": -261.2434387207031, "loss": 0.4806, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.894692420959473, "rewards/margins": 1.7380176782608032, "rewards/rejected": -6.632709503173828, "step": 2680 }, { "epoch": 0.7942131679952761, "grad_norm": 9.996981620788574, "learning_rate": 1.4025343962208444e-05, "logits/chosen": 1.330180287361145, "logits/rejected": 1.3154727220535278, "logps/chosen": -248.03994750976562, "logps/rejected": -263.87591552734375, "loss": 0.4803, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -5.522799968719482, "rewards/margins": 1.5011037588119507, "rewards/rejected": -7.023903846740723, "step": 2690 }, { "epoch": 0.7971656333038086, "grad_norm": 10.904199600219727, "learning_rate": 1.4025066430469443e-05, "logits/chosen": 1.2016797065734863, "logits/rejected": 1.182081937789917, "logps/chosen": -250.35025024414062, "logps/rejected": -271.3921813964844, "loss": 0.5168, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -5.97730827331543, "rewards/margins": 1.6940381526947021, "rewards/rejected": -7.6713457107543945, "step": 2700 }, { "epoch": 0.8001180986123413, "grad_norm": 21.9698486328125, "learning_rate": 1.4024788898730442e-05, "logits/chosen": 1.1048212051391602, "logits/rejected": 1.0970510244369507, "logps/chosen": -255.02041625976562, "logps/rejected": -269.94573974609375, "loss": 0.5597, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -6.0393266677856445, "rewards/margins": 1.5044111013412476, "rewards/rejected": -7.543737888336182, "step": 2710 }, { "epoch": 0.803070563920874, "grad_norm": 10.842463493347168, "learning_rate": 1.4024511366991439e-05, "logits/chosen": 1.13896644115448, "logits/rejected": 1.1085104942321777, "logps/chosen": -238.8287811279297, "logps/rejected": -264.34283447265625, "loss": 0.4323, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.872581958770752, "rewards/margins": 2.3927454948425293, "rewards/rejected": -7.265327453613281, "step": 2720 }, { "epoch": 0.8060230292294066, "grad_norm": 13.687220573425293, "learning_rate": 1.4024233835252436e-05, "logits/chosen": 0.895276665687561, "logits/rejected": 0.8906949162483215, "logps/chosen": -240.13809204101562, "logps/rejected": -257.04986572265625, "loss": 0.4616, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -4.959841251373291, "rewards/margins": 1.6723461151123047, "rewards/rejected": -6.632187843322754, "step": 2730 }, { "epoch": 0.8089754945379392, "grad_norm": 22.552444458007812, "learning_rate": 1.4023956303513434e-05, "logits/chosen": 0.8243335485458374, "logits/rejected": 0.8133773803710938, "logps/chosen": -254.28396606445312, "logps/rejected": -272.76629638671875, "loss": 0.5063, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -6.00461483001709, "rewards/margins": 2.066737651824951, "rewards/rejected": -8.0713529586792, "step": 2740 }, { "epoch": 0.8119279598464718, "grad_norm": 6.763545989990234, "learning_rate": 1.4023678771774433e-05, "logits/chosen": 1.1901601552963257, "logits/rejected": 1.1711593866348267, "logps/chosen": -245.2090301513672, "logps/rejected": -265.5359802246094, "loss": 0.3186, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -5.573644638061523, "rewards/margins": 2.1917998790740967, "rewards/rejected": -7.765444755554199, "step": 2750 }, { "epoch": 0.8148804251550045, "grad_norm": 15.926094055175781, "learning_rate": 1.402340124003543e-05, "logits/chosen": 0.8387190103530884, "logits/rejected": 0.8163677453994751, "logps/chosen": -248.794921875, "logps/rejected": -267.63897705078125, "loss": 0.4531, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -5.871561527252197, "rewards/margins": 1.714848518371582, "rewards/rejected": -7.5864105224609375, "step": 2760 }, { "epoch": 0.817832890463537, "grad_norm": 8.655818939208984, "learning_rate": 1.4023123708296427e-05, "logits/chosen": 1.0133886337280273, "logits/rejected": 0.9869579076766968, "logps/chosen": -244.85403442382812, "logps/rejected": -272.19378662109375, "loss": 0.28, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -5.4621734619140625, "rewards/margins": 2.4782052040100098, "rewards/rejected": -7.9403791427612305, "step": 2770 }, { "epoch": 0.8207853557720697, "grad_norm": 19.17549705505371, "learning_rate": 1.4022846176557426e-05, "logits/chosen": 1.1697003841400146, "logits/rejected": 1.161454439163208, "logps/chosen": -244.40939331054688, "logps/rejected": -267.5545349121094, "loss": 0.4119, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -5.623203277587891, "rewards/margins": 2.322509288787842, "rewards/rejected": -7.945713043212891, "step": 2780 }, { "epoch": 0.8237378210806023, "grad_norm": 5.153756141662598, "learning_rate": 1.4022568644818425e-05, "logits/chosen": 1.1297985315322876, "logits/rejected": 1.1130739450454712, "logps/chosen": -247.5915985107422, "logps/rejected": -269.71044921875, "loss": 0.3097, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.40482234954834, "rewards/margins": 2.3384013175964355, "rewards/rejected": -7.743224143981934, "step": 2790 }, { "epoch": 0.8266902863891349, "grad_norm": 15.042340278625488, "learning_rate": 1.4022291113079422e-05, "logits/chosen": 0.9391604661941528, "logits/rejected": 0.9196814298629761, "logps/chosen": -244.907958984375, "logps/rejected": -261.3716125488281, "loss": 0.4468, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -4.917660713195801, "rewards/margins": 2.008544445037842, "rewards/rejected": -6.926206111907959, "step": 2800 }, { "epoch": 0.8296427516976675, "grad_norm": 16.085792541503906, "learning_rate": 1.4022013581340419e-05, "logits/chosen": 0.7679022550582886, "logits/rejected": 0.76078200340271, "logps/chosen": -232.386474609375, "logps/rejected": -250.2677764892578, "loss": 0.3759, "rewards/accuracies": 0.8166667819023132, "rewards/chosen": -4.374277114868164, "rewards/margins": 1.6837778091430664, "rewards/rejected": -6.0580549240112305, "step": 2810 }, { "epoch": 0.8325952170062002, "grad_norm": 10.890873908996582, "learning_rate": 1.4021736049601418e-05, "logits/chosen": 0.9712487459182739, "logits/rejected": 0.9764541387557983, "logps/chosen": -237.28390502929688, "logps/rejected": -248.5562744140625, "loss": 0.6268, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -4.542433738708496, "rewards/margins": 1.140343427658081, "rewards/rejected": -5.6827778816223145, "step": 2820 }, { "epoch": 0.8355476823147328, "grad_norm": 19.612770080566406, "learning_rate": 1.4021458517862416e-05, "logits/chosen": 1.0678365230560303, "logits/rejected": 1.0637309551239014, "logps/chosen": -237.728271484375, "logps/rejected": -256.4583740234375, "loss": 0.4525, "rewards/accuracies": 0.75, "rewards/chosen": -4.296916961669922, "rewards/margins": 1.5198354721069336, "rewards/rejected": -5.8167524337768555, "step": 2830 }, { "epoch": 0.8385001476232654, "grad_norm": 12.950839042663574, "learning_rate": 1.4021180986123414e-05, "logits/chosen": 1.2725284099578857, "logits/rejected": 1.2577356100082397, "logps/chosen": -242.2948760986328, "logps/rejected": -250.6757049560547, "loss": 0.4871, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -4.731317043304443, "rewards/margins": 1.223638892173767, "rewards/rejected": -5.954955577850342, "step": 2840 }, { "epoch": 0.8414526129317981, "grad_norm": 13.588140487670898, "learning_rate": 1.4020903454384412e-05, "logits/chosen": 1.175161600112915, "logits/rejected": 1.1749106645584106, "logps/chosen": -240.73495483398438, "logps/rejected": -260.48870849609375, "loss": 0.407, "rewards/accuracies": 0.8666667938232422, "rewards/chosen": -4.905536651611328, "rewards/margins": 1.8048782348632812, "rewards/rejected": -6.710413932800293, "step": 2850 }, { "epoch": 0.8444050782403307, "grad_norm": 12.539867401123047, "learning_rate": 1.402062592264541e-05, "logits/chosen": 0.7971091866493225, "logits/rejected": 0.7951844930648804, "logps/chosen": -240.7291717529297, "logps/rejected": -257.8313293457031, "loss": 0.4711, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5.100241661071777, "rewards/margins": 1.59716796875, "rewards/rejected": -6.697409629821777, "step": 2860 }, { "epoch": 0.8473575435488633, "grad_norm": 1.6819536685943604, "learning_rate": 1.4020348390906406e-05, "logits/chosen": 0.9496736526489258, "logits/rejected": 0.9317310452461243, "logps/chosen": -248.58065795898438, "logps/rejected": -270.78106689453125, "loss": 0.3593, "rewards/accuracies": 0.8833333849906921, "rewards/chosen": -5.46395206451416, "rewards/margins": 2.2096076011657715, "rewards/rejected": -7.673558712005615, "step": 2870 }, { "epoch": 0.8503100088573959, "grad_norm": 15.674724578857422, "learning_rate": 1.4020070859167405e-05, "logits/chosen": 0.9865518808364868, "logits/rejected": 0.9733440279960632, "logps/chosen": -257.0057678222656, "logps/rejected": -274.56884765625, "loss": 0.5388, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6.554886817932129, "rewards/margins": 1.5401452779769897, "rewards/rejected": -8.09503173828125, "step": 2880 }, { "epoch": 0.8532624741659286, "grad_norm": 18.558452606201172, "learning_rate": 1.4019793327428404e-05, "logits/chosen": 1.142879605293274, "logits/rejected": 1.1181796789169312, "logps/chosen": -253.58670043945312, "logps/rejected": -276.4182434082031, "loss": 0.3303, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.006396293640137, "rewards/margins": 2.1409544944763184, "rewards/rejected": -8.147351264953613, "step": 2890 }, { "epoch": 0.8562149394744611, "grad_norm": 7.819210529327393, "learning_rate": 1.4019515795689401e-05, "logits/chosen": 0.9120739102363586, "logits/rejected": 0.9137898683547974, "logps/chosen": -258.72076416015625, "logps/rejected": -274.10552978515625, "loss": 0.4594, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -6.799785614013672, "rewards/margins": 1.645308256149292, "rewards/rejected": -8.445094108581543, "step": 2900 }, { "epoch": 0.8591674047829938, "grad_norm": 18.903823852539062, "learning_rate": 1.4019238263950398e-05, "logits/chosen": 0.7846294641494751, "logits/rejected": 0.7610937356948853, "logps/chosen": -283.12701416015625, "logps/rejected": -305.43560791015625, "loss": 0.3833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.653009414672852, "rewards/margins": 2.5519661903381348, "rewards/rejected": -11.204975128173828, "step": 2910 }, { "epoch": 0.8621198700915265, "grad_norm": 16.36449432373047, "learning_rate": 1.4018960732211397e-05, "logits/chosen": 1.0208808183670044, "logits/rejected": 1.0209038257598877, "logps/chosen": -280.11370849609375, "logps/rejected": -297.32623291015625, "loss": 0.6074, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -8.609851837158203, "rewards/margins": 1.6222337484359741, "rewards/rejected": -10.232084274291992, "step": 2920 }, { "epoch": 0.865072335400059, "grad_norm": 5.273636341094971, "learning_rate": 1.4018683200472396e-05, "logits/chosen": 1.1004563570022583, "logits/rejected": 1.0893090963363647, "logps/chosen": -270.9635314941406, "logps/rejected": -287.27703857421875, "loss": 0.5173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.514532566070557, "rewards/margins": 2.0178542137145996, "rewards/rejected": -9.53238582611084, "step": 2930 }, { "epoch": 0.8680248007085917, "grad_norm": 10.385673522949219, "learning_rate": 1.4018405668733393e-05, "logits/chosen": 0.5243192911148071, "logits/rejected": 0.5110400319099426, "logps/chosen": -261.01507568359375, "logps/rejected": -284.13262939453125, "loss": 0.5019, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -7.178466796875, "rewards/margins": 1.833875298500061, "rewards/rejected": -9.01234245300293, "step": 2940 }, { "epoch": 0.8709772660171243, "grad_norm": 9.21269416809082, "learning_rate": 1.401812813699439e-05, "logits/chosen": 0.8334879875183105, "logits/rejected": 0.7947150468826294, "logps/chosen": -258.5113220214844, "logps/rejected": -275.12017822265625, "loss": 0.3189, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -6.284754753112793, "rewards/margins": 2.153109312057495, "rewards/rejected": -8.437864303588867, "step": 2950 }, { "epoch": 0.8739297313256569, "grad_norm": 21.07270050048828, "learning_rate": 1.401785060525539e-05, "logits/chosen": 0.7732642889022827, "logits/rejected": 0.7505866289138794, "logps/chosen": -250.7454071044922, "logps/rejected": -269.18841552734375, "loss": 0.5447, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -5.857220649719238, "rewards/margins": 1.7945804595947266, "rewards/rejected": -7.651802062988281, "step": 2960 }, { "epoch": 0.8768821966341895, "grad_norm": 14.960626602172852, "learning_rate": 1.4017573073516387e-05, "logits/chosen": 1.1897975206375122, "logits/rejected": 1.1994853019714355, "logps/chosen": -244.46926879882812, "logps/rejected": -263.2671203613281, "loss": 0.5026, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5.157473564147949, "rewards/margins": 1.5558032989501953, "rewards/rejected": -6.713277339935303, "step": 2970 }, { "epoch": 0.8798346619427222, "grad_norm": 4.3260416984558105, "learning_rate": 1.4017295541777384e-05, "logits/chosen": 0.9848998785018921, "logits/rejected": 0.9930203557014465, "logps/chosen": -237.6705780029297, "logps/rejected": -246.1083984375, "loss": 0.545, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -4.069361209869385, "rewards/margins": 1.469951868057251, "rewards/rejected": -5.539313316345215, "step": 2980 }, { "epoch": 0.8827871272512547, "grad_norm": 5.920868873596191, "learning_rate": 1.4017018010038381e-05, "logits/chosen": 1.4375402927398682, "logits/rejected": 1.4167811870574951, "logps/chosen": -231.298828125, "logps/rejected": -251.663818359375, "loss": 0.4397, "rewards/accuracies": 0.75, "rewards/chosen": -4.141283988952637, "rewards/margins": 1.554025411605835, "rewards/rejected": -5.695309638977051, "step": 2990 }, { "epoch": 0.8857395925597874, "grad_norm": 14.093681335449219, "learning_rate": 1.4016740478299382e-05, "logits/chosen": 0.7898784279823303, "logits/rejected": 0.7911673784255981, "logps/chosen": -234.6328582763672, "logps/rejected": -245.76187133789062, "loss": 0.47, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4.351687431335449, "rewards/margins": 1.466024398803711, "rewards/rejected": -5.81771183013916, "step": 3000 }, { "epoch": 0.8886920578683201, "grad_norm": 15.90947151184082, "learning_rate": 1.4016462946560379e-05, "logits/chosen": 1.0840799808502197, "logits/rejected": 1.0675323009490967, "logps/chosen": -240.3332061767578, "logps/rejected": -252.4126434326172, "loss": 0.5205, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -4.847250938415527, "rewards/margins": 1.4572839736938477, "rewards/rejected": -6.304534912109375, "step": 3010 }, { "epoch": 0.8916445231768527, "grad_norm": 15.364027976989746, "learning_rate": 1.4016185414821376e-05, "logits/chosen": 1.2139637470245361, "logits/rejected": 1.203858494758606, "logps/chosen": -237.54061889648438, "logps/rejected": -257.0138244628906, "loss": 0.3506, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.517528533935547, "rewards/margins": 1.895896553993225, "rewards/rejected": -6.413424491882324, "step": 3020 }, { "epoch": 0.8945969884853853, "grad_norm": 12.898478507995605, "learning_rate": 1.4015907883082373e-05, "logits/chosen": 1.0201280117034912, "logits/rejected": 1.0226027965545654, "logps/chosen": -242.8017578125, "logps/rejected": -260.9385070800781, "loss": 0.6586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.116764068603516, "rewards/margins": 1.6011621952056885, "rewards/rejected": -6.717926025390625, "step": 3030 }, { "epoch": 0.8975494537939179, "grad_norm": 9.787171363830566, "learning_rate": 1.4015630351343373e-05, "logits/chosen": 1.1525518894195557, "logits/rejected": 1.1269786357879639, "logps/chosen": -238.3458709716797, "logps/rejected": -259.6780090332031, "loss": 0.4296, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -4.6883649826049805, "rewards/margins": 2.0164406299591064, "rewards/rejected": -6.704805850982666, "step": 3040 }, { "epoch": 0.9005019191024506, "grad_norm": 8.329623222351074, "learning_rate": 1.401535281960437e-05, "logits/chosen": 1.0368040800094604, "logits/rejected": 1.0099343061447144, "logps/chosen": -246.0287322998047, "logps/rejected": -264.33453369140625, "loss": 0.3403, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -5.318078994750977, "rewards/margins": 2.229231595993042, "rewards/rejected": -7.547311305999756, "step": 3050 }, { "epoch": 0.9034543844109831, "grad_norm": 12.080394744873047, "learning_rate": 1.4015075287865368e-05, "logits/chosen": 0.9694647789001465, "logits/rejected": 0.9714139103889465, "logps/chosen": -245.89608764648438, "logps/rejected": -267.92388916015625, "loss": 0.3858, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -5.448779106140137, "rewards/margins": 2.0930778980255127, "rewards/rejected": -7.5418572425842285, "step": 3060 }, { "epoch": 0.9064068497195158, "grad_norm": 26.513702392578125, "learning_rate": 1.4014797756126366e-05, "logits/chosen": 1.2117961645126343, "logits/rejected": 1.1758077144622803, "logps/chosen": -257.79022216796875, "logps/rejected": -269.34912109375, "loss": 0.6092, "rewards/accuracies": 0.6833332777023315, "rewards/chosen": -6.196490287780762, "rewards/margins": 1.6729987859725952, "rewards/rejected": -7.869488716125488, "step": 3070 }, { "epoch": 0.9093593150280485, "grad_norm": 19.822019577026367, "learning_rate": 1.4014520224387365e-05, "logits/chosen": 0.9343527555465698, "logits/rejected": 0.9282553791999817, "logps/chosen": -254.22817993164062, "logps/rejected": -270.15716552734375, "loss": 0.4403, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -6.049466133117676, "rewards/margins": 1.775336503982544, "rewards/rejected": -7.824803352355957, "step": 3080 }, { "epoch": 0.912311780336581, "grad_norm": 9.078575134277344, "learning_rate": 1.4014242692648362e-05, "logits/chosen": 1.081786036491394, "logits/rejected": 1.0569143295288086, "logps/chosen": -261.05072021484375, "logps/rejected": -275.80584716796875, "loss": 0.5563, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -6.963624000549316, "rewards/margins": 1.3629686832427979, "rewards/rejected": -8.326592445373535, "step": 3090 }, { "epoch": 0.9152642456451137, "grad_norm": 13.950883865356445, "learning_rate": 1.4013965160909359e-05, "logits/chosen": 0.8515877723693848, "logits/rejected": 0.8364941477775574, "logps/chosen": -259.63726806640625, "logps/rejected": -278.04156494140625, "loss": 0.5814, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -6.349544525146484, "rewards/margins": 1.6570813655853271, "rewards/rejected": -8.00662612915039, "step": 3100 }, { "epoch": 0.9182167109536463, "grad_norm": 3.975935220718384, "learning_rate": 1.4013687629170358e-05, "logits/chosen": 0.7457830309867859, "logits/rejected": 0.7358218431472778, "logps/chosen": -248.9744873046875, "logps/rejected": -269.8999938964844, "loss": 0.3309, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -5.649913311004639, "rewards/margins": 2.0708701610565186, "rewards/rejected": -7.7207841873168945, "step": 3110 }, { "epoch": 0.9211691762621789, "grad_norm": 21.138303756713867, "learning_rate": 1.4013410097431357e-05, "logits/chosen": 0.9502171277999878, "logits/rejected": 0.9545202255249023, "logps/chosen": -245.86355590820312, "logps/rejected": -260.61614990234375, "loss": 0.526, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -5.566435813903809, "rewards/margins": 1.4863369464874268, "rewards/rejected": -7.052772521972656, "step": 3120 }, { "epoch": 0.9241216415707115, "grad_norm": 16.62441062927246, "learning_rate": 1.4013132565692354e-05, "logits/chosen": 0.8942596316337585, "logits/rejected": 0.8870570063591003, "logps/chosen": -249.0651397705078, "logps/rejected": -269.85125732421875, "loss": 0.4024, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -5.77833890914917, "rewards/margins": 1.801588773727417, "rewards/rejected": -7.579927921295166, "step": 3130 }, { "epoch": 0.9270741068792442, "grad_norm": 11.895112037658691, "learning_rate": 1.401285503395335e-05, "logits/chosen": 0.9253729581832886, "logits/rejected": 0.8895522356033325, "logps/chosen": -249.4697723388672, "logps/rejected": -270.6191101074219, "loss": 0.31, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -5.913967609405518, "rewards/margins": 1.8593711853027344, "rewards/rejected": -7.773339748382568, "step": 3140 }, { "epoch": 0.9300265721877768, "grad_norm": 15.00984001159668, "learning_rate": 1.401257750221435e-05, "logits/chosen": 1.228806734085083, "logits/rejected": 1.2069668769836426, "logps/chosen": -254.57373046875, "logps/rejected": -280.9693908691406, "loss": 0.5172, "rewards/accuracies": 0.7500001192092896, "rewards/chosen": -6.643713474273682, "rewards/margins": 1.7999267578125, "rewards/rejected": -8.443639755249023, "step": 3150 }, { "epoch": 0.9329790374963094, "grad_norm": 16.683391571044922, "learning_rate": 1.4012299970475347e-05, "logits/chosen": 0.9733425974845886, "logits/rejected": 0.9506564140319824, "logps/chosen": -264.1029052734375, "logps/rejected": -282.020751953125, "loss": 0.4882, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -7.119184970855713, "rewards/margins": 1.8379993438720703, "rewards/rejected": -8.957183837890625, "step": 3160 }, { "epoch": 0.935931502804842, "grad_norm": 11.276632308959961, "learning_rate": 1.4012022438736345e-05, "logits/chosen": 1.1644847393035889, "logits/rejected": 1.148648977279663, "logps/chosen": -254.95059204101562, "logps/rejected": -276.86065673828125, "loss": 0.4896, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6.630859375, "rewards/margins": 2.0140929222106934, "rewards/rejected": -8.644952774047852, "step": 3170 }, { "epoch": 0.9388839681133747, "grad_norm": 22.761289596557617, "learning_rate": 1.4011744906997344e-05, "logits/chosen": 1.2373731136322021, "logits/rejected": 1.2270545959472656, "logps/chosen": -257.74481201171875, "logps/rejected": -276.2661437988281, "loss": 0.4508, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6.585168361663818, "rewards/margins": 1.84341299533844, "rewards/rejected": -8.428581237792969, "step": 3180 }, { "epoch": 0.9418364334219073, "grad_norm": 20.603103637695312, "learning_rate": 1.4011467375258341e-05, "logits/chosen": 1.04044508934021, "logits/rejected": 1.0198522806167603, "logps/chosen": -241.0797882080078, "logps/rejected": -265.94232177734375, "loss": 0.4086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.131291389465332, "rewards/margins": 2.299708604812622, "rewards/rejected": -7.431000709533691, "step": 3190 }, { "epoch": 0.9447888987304399, "grad_norm": 16.17703628540039, "learning_rate": 1.4011189843519338e-05, "logits/chosen": 0.8929934501647949, "logits/rejected": 0.8735634088516235, "logps/chosen": -259.1670227050781, "logps/rejected": -284.5030212402344, "loss": 0.312, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -6.012137413024902, "rewards/margins": 2.730707883834839, "rewards/rejected": -8.74284553527832, "step": 3200 }, { "epoch": 0.9477413640389726, "grad_norm": 24.91924285888672, "learning_rate": 1.4010912311780337e-05, "logits/chosen": 0.7341073751449585, "logits/rejected": 0.7165102958679199, "logps/chosen": -255.8665313720703, "logps/rejected": -281.0421447753906, "loss": 0.3103, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -6.350571632385254, "rewards/margins": 2.51487398147583, "rewards/rejected": -8.865446090698242, "step": 3210 }, { "epoch": 0.9506938293475051, "grad_norm": 2.3971610069274902, "learning_rate": 1.4010634780041336e-05, "logits/chosen": 0.7332594394683838, "logits/rejected": 0.7224677801132202, "logps/chosen": -262.72308349609375, "logps/rejected": -284.6925354003906, "loss": 0.4883, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -7.268516540527344, "rewards/margins": 2.0014407634735107, "rewards/rejected": -9.269956588745117, "step": 3220 }, { "epoch": 0.9536462946560378, "grad_norm": 21.948976516723633, "learning_rate": 1.4010357248302333e-05, "logits/chosen": 1.110567331314087, "logits/rejected": 1.0986393690109253, "logps/chosen": -262.4955749511719, "logps/rejected": -280.7978820800781, "loss": 0.446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.83743953704834, "rewards/margins": 1.9677139520645142, "rewards/rejected": -8.805153846740723, "step": 3230 }, { "epoch": 0.9565987599645704, "grad_norm": 7.426965236663818, "learning_rate": 1.401007971656333e-05, "logits/chosen": 0.9063795804977417, "logits/rejected": 0.8990973234176636, "logps/chosen": -254.8214111328125, "logps/rejected": -269.732666015625, "loss": 0.4315, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -6.3856096267700195, "rewards/margins": 2.0125324726104736, "rewards/rejected": -8.398141860961914, "step": 3240 }, { "epoch": 0.959551225273103, "grad_norm": 3.2210404872894287, "learning_rate": 1.4009802184824329e-05, "logits/chosen": 1.0932416915893555, "logits/rejected": 1.0791380405426025, "logps/chosen": -250.179931640625, "logps/rejected": -270.9878845214844, "loss": 0.4939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.787269592285156, "rewards/margins": 2.1213865280151367, "rewards/rejected": -7.908656120300293, "step": 3250 }, { "epoch": 0.9625036905816357, "grad_norm": 10.770796775817871, "learning_rate": 1.4009524653085327e-05, "logits/chosen": 1.3158859014511108, "logits/rejected": 1.2794454097747803, "logps/chosen": -249.47726440429688, "logps/rejected": -279.02264404296875, "loss": 0.311, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.358817100524902, "rewards/margins": 3.100614070892334, "rewards/rejected": -8.459430694580078, "step": 3260 }, { "epoch": 0.9654561558901683, "grad_norm": 8.527506828308105, "learning_rate": 1.4009247121346324e-05, "logits/chosen": 0.7765678763389587, "logits/rejected": 0.7660807371139526, "logps/chosen": -253.6894989013672, "logps/rejected": -278.22760009765625, "loss": 0.3204, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.946650505065918, "rewards/margins": 2.5533249378204346, "rewards/rejected": -8.499975204467773, "step": 3270 }, { "epoch": 0.968408621198701, "grad_norm": 16.958982467651367, "learning_rate": 1.4008969589607322e-05, "logits/chosen": 1.2036083936691284, "logits/rejected": 1.1684587001800537, "logps/chosen": -271.6014404296875, "logps/rejected": -287.117431640625, "loss": 0.5068, "rewards/accuracies": 0.75, "rewards/chosen": -7.67684268951416, "rewards/margins": 1.8981605768203735, "rewards/rejected": -9.575002670288086, "step": 3280 }, { "epoch": 0.9713610865072335, "grad_norm": 6.3421101570129395, "learning_rate": 1.4008692057868322e-05, "logits/chosen": 0.7102869749069214, "logits/rejected": 0.7115236520767212, "logps/chosen": -258.9211730957031, "logps/rejected": -279.21600341796875, "loss": 0.4871, "rewards/accuracies": 0.7500001192092896, "rewards/chosen": -6.763546943664551, "rewards/margins": 1.8179051876068115, "rewards/rejected": -8.581453323364258, "step": 3290 }, { "epoch": 0.9743135518157662, "grad_norm": 5.567224502563477, "learning_rate": 1.4008414526129319e-05, "logits/chosen": 0.6543753743171692, "logits/rejected": 0.6532643437385559, "logps/chosen": -244.94039916992188, "logps/rejected": -264.81256103515625, "loss": 0.5238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.246161937713623, "rewards/margins": 1.5368661880493164, "rewards/rejected": -6.783027648925781, "step": 3300 }, { "epoch": 0.9772660171242988, "grad_norm": 14.354612350463867, "learning_rate": 1.4008136994390316e-05, "logits/chosen": 0.8010032773017883, "logits/rejected": 0.7810193300247192, "logps/chosen": -242.9619598388672, "logps/rejected": -270.2242736816406, "loss": 0.3638, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.514560222625732, "rewards/margins": 2.418081045150757, "rewards/rejected": -6.932641506195068, "step": 3310 }, { "epoch": 0.9802184824328314, "grad_norm": 19.21661949157715, "learning_rate": 1.4007859462651313e-05, "logits/chosen": 0.8700664639472961, "logits/rejected": 0.8646572828292847, "logps/chosen": -242.8661651611328, "logps/rejected": -264.07098388671875, "loss": 0.3431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.078057289123535, "rewards/margins": 2.2047085762023926, "rewards/rejected": -7.282765865325928, "step": 3320 }, { "epoch": 0.983170947741364, "grad_norm": 5.330258369445801, "learning_rate": 1.4007581930912314e-05, "logits/chosen": 0.9477923512458801, "logits/rejected": 0.9462806582450867, "logps/chosen": -252.0956268310547, "logps/rejected": -269.7242126464844, "loss": 0.3768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.58902645111084, "rewards/margins": 2.073425769805908, "rewards/rejected": -7.662452697753906, "step": 3330 }, { "epoch": 0.9861234130498967, "grad_norm": 5.401336669921875, "learning_rate": 1.400730439917331e-05, "logits/chosen": 0.6821188926696777, "logits/rejected": 0.6770846247673035, "logps/chosen": -258.6656799316406, "logps/rejected": -284.0771484375, "loss": 0.3907, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -6.481507778167725, "rewards/margins": 1.8569139242172241, "rewards/rejected": -8.338422775268555, "step": 3340 }, { "epoch": 0.9890758783584293, "grad_norm": 16.365318298339844, "learning_rate": 1.4007026867434308e-05, "logits/chosen": 0.7911583185195923, "logits/rejected": 0.7768401503562927, "logps/chosen": -264.9526062011719, "logps/rejected": -284.7171936035156, "loss": 0.4879, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -6.958296298980713, "rewards/margins": 1.754610300064087, "rewards/rejected": -8.712905883789062, "step": 3350 }, { "epoch": 0.9920283436669619, "grad_norm": 16.361148834228516, "learning_rate": 1.4006749335695305e-05, "logits/chosen": 0.8216841816902161, "logits/rejected": 0.7941895127296448, "logps/chosen": -265.3275146484375, "logps/rejected": -289.08807373046875, "loss": 0.391, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -7.497669219970703, "rewards/margins": 2.124753952026367, "rewards/rejected": -9.62242317199707, "step": 3360 }, { "epoch": 0.9949808089754946, "grad_norm": 9.442657470703125, "learning_rate": 1.4006471803956305e-05, "logits/chosen": 0.5929777026176453, "logits/rejected": 0.5803393125534058, "logps/chosen": -258.7288513183594, "logps/rejected": -285.96734619140625, "loss": 0.3905, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -6.983333587646484, "rewards/margins": 2.128347635269165, "rewards/rejected": -9.11168098449707, "step": 3370 }, { "epoch": 0.9979332742840271, "grad_norm": 16.531021118164062, "learning_rate": 1.4006194272217302e-05, "logits/chosen": 0.5771923661231995, "logits/rejected": 0.5456576347351074, "logps/chosen": -266.9584045410156, "logps/rejected": -291.78753662109375, "loss": 0.5573, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -7.553260803222656, "rewards/margins": 1.9732072353363037, "rewards/rejected": -9.526466369628906, "step": 3380 }, { "epoch": 1.0008857395925599, "grad_norm": 3.3778305053710938, "learning_rate": 1.40059167404783e-05, "logits/chosen": 0.7008672952651978, "logits/rejected": 0.6697190999984741, "logps/chosen": -263.03057861328125, "logps/rejected": -281.4614562988281, "loss": 0.3614, "rewards/accuracies": 0.8166666030883789, "rewards/chosen": -7.416905403137207, "rewards/margins": 2.016350507736206, "rewards/rejected": -9.433256149291992, "step": 3390 }, { "epoch": 1.0038382049010923, "grad_norm": 14.1766357421875, "learning_rate": 1.4005639208739298e-05, "logits/chosen": 0.7554770708084106, "logits/rejected": 0.742950975894928, "logps/chosen": -271.34716796875, "logps/rejected": -297.9550476074219, "loss": 0.5209, "rewards/accuracies": 0.783333420753479, "rewards/chosen": -7.588138580322266, "rewards/margins": 2.0761444568634033, "rewards/rejected": -9.66428279876709, "step": 3400 }, { "epoch": 1.006790670209625, "grad_norm": 16.774288177490234, "learning_rate": 1.4005361677000297e-05, "logits/chosen": 0.8897944688796997, "logits/rejected": 0.869391143321991, "logps/chosen": -259.3974914550781, "logps/rejected": -281.4565734863281, "loss": 0.3712, "rewards/accuracies": 0.8166666030883789, "rewards/chosen": -6.738577842712402, "rewards/margins": 2.168720006942749, "rewards/rejected": -8.90729808807373, "step": 3410 }, { "epoch": 1.0097431355181576, "grad_norm": 11.555708885192871, "learning_rate": 1.4005084145261294e-05, "logits/chosen": 0.9507796168327332, "logits/rejected": 0.9205015897750854, "logps/chosen": -250.6663360595703, "logps/rejected": -268.1684265136719, "loss": 0.4143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.782897472381592, "rewards/margins": 2.199159622192383, "rewards/rejected": -7.982057094573975, "step": 3420 }, { "epoch": 1.0126956008266903, "grad_norm": 12.427129745483398, "learning_rate": 1.4004806613522291e-05, "logits/chosen": 1.0481797456741333, "logits/rejected": 1.036689043045044, "logps/chosen": -247.54476928710938, "logps/rejected": -261.5200500488281, "loss": 0.3718, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -5.232061386108398, "rewards/margins": 1.8601652383804321, "rewards/rejected": -7.092226505279541, "step": 3430 }, { "epoch": 1.015648066135223, "grad_norm": 8.461666107177734, "learning_rate": 1.400452908178329e-05, "logits/chosen": 0.7249832153320312, "logits/rejected": 0.7009894847869873, "logps/chosen": -246.6259765625, "logps/rejected": -269.25482177734375, "loss": 0.2104, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": -5.3838582038879395, "rewards/margins": 2.751004457473755, "rewards/rejected": -8.134862899780273, "step": 3440 }, { "epoch": 1.0186005314437556, "grad_norm": 10.531888961791992, "learning_rate": 1.4004251550044289e-05, "logits/chosen": 0.9848181009292603, "logits/rejected": 0.9584924578666687, "logps/chosen": -256.73516845703125, "logps/rejected": -284.3894348144531, "loss": 0.2465, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -6.174295902252197, "rewards/margins": 2.834613561630249, "rewards/rejected": -9.008909225463867, "step": 3450 }, { "epoch": 1.021552996752288, "grad_norm": 19.498708724975586, "learning_rate": 1.4003974018305286e-05, "logits/chosen": 0.7478333711624146, "logits/rejected": 0.7430461645126343, "logps/chosen": -253.4674835205078, "logps/rejected": -280.6539001464844, "loss": 0.5049, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -6.5148210525512695, "rewards/margins": 2.121018886566162, "rewards/rejected": -8.63584041595459, "step": 3460 }, { "epoch": 1.0245054620608207, "grad_norm": 8.379602432250977, "learning_rate": 1.4003696486566283e-05, "logits/chosen": 0.872502326965332, "logits/rejected": 0.8434234857559204, "logps/chosen": -261.23162841796875, "logps/rejected": -282.51287841796875, "loss": 0.3814, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -6.6103973388671875, "rewards/margins": 2.3487401008605957, "rewards/rejected": -8.959137916564941, "step": 3470 }, { "epoch": 1.0274579273693534, "grad_norm": 43.77033996582031, "learning_rate": 1.4003418954827281e-05, "logits/chosen": 0.6371831297874451, "logits/rejected": 0.5926028490066528, "logps/chosen": -256.0489196777344, "logps/rejected": -281.2890625, "loss": 0.373, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -6.5200629234313965, "rewards/margins": 2.33754301071167, "rewards/rejected": -8.857605934143066, "step": 3480 }, { "epoch": 1.030410392677886, "grad_norm": 29.729633331298828, "learning_rate": 1.4003141423088278e-05, "logits/chosen": 0.5511474609375, "logits/rejected": 0.5611943006515503, "logps/chosen": -262.53875732421875, "logps/rejected": -285.2470397949219, "loss": 0.3858, "rewards/accuracies": 0.8666667938232422, "rewards/chosen": -6.937355995178223, "rewards/margins": 2.6559646129608154, "rewards/rejected": -9.593320846557617, "step": 3490 }, { "epoch": 1.0333628579864187, "grad_norm": 25.957687377929688, "learning_rate": 1.4002863891349277e-05, "logits/chosen": 0.5613692998886108, "logits/rejected": 0.5420530438423157, "logps/chosen": -267.31402587890625, "logps/rejected": -295.7972412109375, "loss": 0.3344, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.485939025878906, "rewards/margins": 2.6002955436706543, "rewards/rejected": -10.086235046386719, "step": 3500 }, { "epoch": 1.0363153232949514, "grad_norm": 12.77514362335205, "learning_rate": 1.4002586359610276e-05, "logits/chosen": 0.7775045037269592, "logits/rejected": 0.730430543422699, "logps/chosen": -279.873291015625, "logps/rejected": -306.6976318359375, "loss": 0.3492, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -8.578255653381348, "rewards/margins": 2.7563068866729736, "rewards/rejected": -11.334562301635742, "step": 3510 }, { "epoch": 1.039267788603484, "grad_norm": 6.553199291229248, "learning_rate": 1.4002308827871273e-05, "logits/chosen": 0.5426122546195984, "logits/rejected": 0.5224928259849548, "logps/chosen": -274.1863708496094, "logps/rejected": -300.796142578125, "loss": 0.3906, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -7.880624294281006, "rewards/margins": 2.613147735595703, "rewards/rejected": -10.493772506713867, "step": 3520 }, { "epoch": 1.0422202539120164, "grad_norm": 23.301095962524414, "learning_rate": 1.400203129613227e-05, "logits/chosen": 0.6987696886062622, "logits/rejected": 0.6842071413993835, "logps/chosen": -269.9448547363281, "logps/rejected": -293.10595703125, "loss": 0.3839, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -8.097990989685059, "rewards/margins": 2.244403839111328, "rewards/rejected": -10.342394828796387, "step": 3530 }, { "epoch": 1.045172719220549, "grad_norm": 14.4619722366333, "learning_rate": 1.4001753764393269e-05, "logits/chosen": 0.8183805346488953, "logits/rejected": 0.7927323579788208, "logps/chosen": -272.22137451171875, "logps/rejected": -301.08111572265625, "loss": 0.3712, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -8.200728416442871, "rewards/margins": 2.497093439102173, "rewards/rejected": -10.697820663452148, "step": 3540 }, { "epoch": 1.0481251845290818, "grad_norm": 18.19673728942871, "learning_rate": 1.4001476232654268e-05, "logits/chosen": 0.734916090965271, "logits/rejected": 0.7057896852493286, "logps/chosen": -273.89141845703125, "logps/rejected": -292.6280822753906, "loss": 0.5848, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -8.124327659606934, "rewards/margins": 2.122593402862549, "rewards/rejected": -10.246919631958008, "step": 3550 }, { "epoch": 1.0510776498376144, "grad_norm": 9.27344036102295, "learning_rate": 1.4001198700915265e-05, "logits/chosen": 0.6766483187675476, "logits/rejected": 0.6499351859092712, "logps/chosen": -265.4984130859375, "logps/rejected": -290.6417541503906, "loss": 0.4275, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.651116371154785, "rewards/margins": 1.9769153594970703, "rewards/rejected": -9.628030776977539, "step": 3560 }, { "epoch": 1.054030115146147, "grad_norm": 14.652764320373535, "learning_rate": 1.4000921169176262e-05, "logits/chosen": 0.9536673426628113, "logits/rejected": 0.9396756887435913, "logps/chosen": -265.05340576171875, "logps/rejected": -286.4242248535156, "loss": 0.4025, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -7.042543888092041, "rewards/margins": 1.995811104774475, "rewards/rejected": -9.038354873657227, "step": 3570 }, { "epoch": 1.0569825804546797, "grad_norm": 7.574506759643555, "learning_rate": 1.400064363743726e-05, "logits/chosen": 0.9758737683296204, "logits/rejected": 0.9503101110458374, "logps/chosen": -256.7453918457031, "logps/rejected": -286.32891845703125, "loss": 0.2737, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -6.654661655426025, "rewards/margins": 2.6750776767730713, "rewards/rejected": -9.32973861694336, "step": 3580 }, { "epoch": 1.0599350457632122, "grad_norm": 3.9293437004089355, "learning_rate": 1.400036610569826e-05, "logits/chosen": 0.6547101736068726, "logits/rejected": 0.6432870626449585, "logps/chosen": -253.1500244140625, "logps/rejected": -278.51348876953125, "loss": 0.2953, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.397459983825684, "rewards/margins": 2.286984920501709, "rewards/rejected": -8.684444427490234, "step": 3590 }, { "epoch": 1.0628875110717448, "grad_norm": 3.543229818344116, "learning_rate": 1.4000088573959256e-05, "logits/chosen": 1.0167690515518188, "logits/rejected": 0.9863625764846802, "logps/chosen": -252.7490692138672, "logps/rejected": -293.04644775390625, "loss": 0.2614, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": -6.469398498535156, "rewards/margins": 3.465914249420166, "rewards/rejected": -9.935312271118164, "step": 3600 }, { "epoch": 1.0658399763802775, "grad_norm": 7.259729862213135, "learning_rate": 1.3999811042220253e-05, "logits/chosen": 1.2569879293441772, "logits/rejected": 1.2082161903381348, "logps/chosen": -262.9702453613281, "logps/rejected": -294.75677490234375, "loss": 0.2488, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -6.813244819641113, "rewards/margins": 2.9294795989990234, "rewards/rejected": -9.742724418640137, "step": 3610 }, { "epoch": 1.0687924416888102, "grad_norm": 29.099077224731445, "learning_rate": 1.3999533510481252e-05, "logits/chosen": 0.5352407097816467, "logits/rejected": 0.5032047629356384, "logps/chosen": -277.5157775878906, "logps/rejected": -311.57366943359375, "loss": 0.4875, "rewards/accuracies": 0.8000000715255737, "rewards/chosen": -8.328255653381348, "rewards/margins": 3.0951685905456543, "rewards/rejected": -11.42342472076416, "step": 3620 }, { "epoch": 1.0717449069973428, "grad_norm": 19.440019607543945, "learning_rate": 1.3999255978742251e-05, "logits/chosen": 0.7902118563652039, "logits/rejected": 0.7752079963684082, "logps/chosen": -280.151123046875, "logps/rejected": -308.10430908203125, "loss": 0.3814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.614106178283691, "rewards/margins": 2.7576904296875, "rewards/rejected": -11.371795654296875, "step": 3630 }, { "epoch": 1.0746973723058755, "grad_norm": 10.623760223388672, "learning_rate": 1.3998978447003248e-05, "logits/chosen": 0.7884745597839355, "logits/rejected": 0.7698397636413574, "logps/chosen": -281.9467468261719, "logps/rejected": -308.5909423828125, "loss": 0.3261, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.611579895019531, "rewards/margins": 3.0511250495910645, "rewards/rejected": -11.66270637512207, "step": 3640 }, { "epoch": 1.0776498376144081, "grad_norm": 9.272233009338379, "learning_rate": 1.3998700915264245e-05, "logits/chosen": 0.9923402667045593, "logits/rejected": 0.9639446139335632, "logps/chosen": -290.9359436035156, "logps/rejected": -321.0301818847656, "loss": 0.251, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -9.386919021606445, "rewards/margins": 3.2693705558776855, "rewards/rejected": -12.656289100646973, "step": 3650 }, { "epoch": 1.0806023029229406, "grad_norm": 7.964924335479736, "learning_rate": 1.3998423383525245e-05, "logits/chosen": 0.7736841440200806, "logits/rejected": 0.7498350143432617, "logps/chosen": -294.43341064453125, "logps/rejected": -324.8628845214844, "loss": 0.3654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.913439750671387, "rewards/margins": 2.9243063926696777, "rewards/rejected": -12.837747573852539, "step": 3660 }, { "epoch": 1.0835547682314732, "grad_norm": 2.2966954708099365, "learning_rate": 1.3998145851786243e-05, "logits/chosen": 0.8649827241897583, "logits/rejected": 0.8305040597915649, "logps/chosen": -286.05828857421875, "logps/rejected": -322.74676513671875, "loss": 0.3885, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.688798904418945, "rewards/margins": 3.3092703819274902, "rewards/rejected": -12.998067855834961, "step": 3670 }, { "epoch": 1.086507233540006, "grad_norm": 6.931310653686523, "learning_rate": 1.399786832004724e-05, "logits/chosen": 0.7858456373214722, "logits/rejected": 0.7615053653717041, "logps/chosen": -298.71258544921875, "logps/rejected": -327.73809814453125, "loss": 0.5123, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -10.665101051330566, "rewards/margins": 2.4329681396484375, "rewards/rejected": -13.09807014465332, "step": 3680 }, { "epoch": 1.0894596988485385, "grad_norm": 17.075593948364258, "learning_rate": 1.3997590788308237e-05, "logits/chosen": 0.8955463171005249, "logits/rejected": 0.8496713638305664, "logps/chosen": -287.8481750488281, "logps/rejected": -315.814208984375, "loss": 0.3525, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.60830307006836, "rewards/margins": 2.767622709274292, "rewards/rejected": -12.37592601776123, "step": 3690 }, { "epoch": 1.0924121641570712, "grad_norm": 9.325746536254883, "learning_rate": 1.3997313256569237e-05, "logits/chosen": 0.9554599523544312, "logits/rejected": 0.9211385846138, "logps/chosen": -283.80987548828125, "logps/rejected": -315.113037109375, "loss": 0.315, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -9.406264305114746, "rewards/margins": 2.773836851119995, "rewards/rejected": -12.18010139465332, "step": 3700 }, { "epoch": 1.0953646294656039, "grad_norm": 18.465246200561523, "learning_rate": 1.3997035724830234e-05, "logits/chosen": 0.8363810777664185, "logits/rejected": 0.8065091967582703, "logps/chosen": -278.014404296875, "logps/rejected": -307.4988708496094, "loss": 0.3306, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -8.599360466003418, "rewards/margins": 2.8611056804656982, "rewards/rejected": -11.460467338562012, "step": 3710 }, { "epoch": 1.0983170947741363, "grad_norm": 6.114569664001465, "learning_rate": 1.3996758193091231e-05, "logits/chosen": 0.6249672174453735, "logits/rejected": 0.6118178963661194, "logps/chosen": -270.2017517089844, "logps/rejected": -295.0705261230469, "loss": 0.2919, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -7.991025447845459, "rewards/margins": 2.539654493331909, "rewards/rejected": -10.530680656433105, "step": 3720 }, { "epoch": 1.101269560082669, "grad_norm": 23.41162109375, "learning_rate": 1.399648066135223e-05, "logits/chosen": 0.6955439448356628, "logits/rejected": 0.6551072597503662, "logps/chosen": -272.327392578125, "logps/rejected": -298.0369567871094, "loss": 0.3298, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -7.58107852935791, "rewards/margins": 2.803967237472534, "rewards/rejected": -10.385046005249023, "step": 3730 }, { "epoch": 1.1042220253912016, "grad_norm": 9.33141803741455, "learning_rate": 1.3996203129613229e-05, "logits/chosen": 0.7303314208984375, "logits/rejected": 0.691876232624054, "logps/chosen": -264.88629150390625, "logps/rejected": -291.79571533203125, "loss": 0.4399, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -7.413443565368652, "rewards/margins": 3.014366388320923, "rewards/rejected": -10.42780876159668, "step": 3740 }, { "epoch": 1.1071744906997343, "grad_norm": 13.858841896057129, "learning_rate": 1.3995925597874226e-05, "logits/chosen": 0.8649295568466187, "logits/rejected": 0.8267232179641724, "logps/chosen": -265.09600830078125, "logps/rejected": -291.0889892578125, "loss": 0.4188, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -7.17959451675415, "rewards/margins": 2.4509432315826416, "rewards/rejected": -9.630538940429688, "step": 3750 }, { "epoch": 1.110126956008267, "grad_norm": 15.493797302246094, "learning_rate": 1.3995648066135223e-05, "logits/chosen": 0.582310140132904, "logits/rejected": 0.5597119331359863, "logps/chosen": -255.0871124267578, "logps/rejected": -288.869384765625, "loss": 0.2364, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -6.659852027893066, "rewards/margins": 2.853518009185791, "rewards/rejected": -9.513370513916016, "step": 3760 }, { "epoch": 1.1130794213167996, "grad_norm": 24.494205474853516, "learning_rate": 1.3995370534396222e-05, "logits/chosen": 0.661158561706543, "logits/rejected": 0.6094750165939331, "logps/chosen": -260.8846435546875, "logps/rejected": -292.5269775390625, "loss": 0.4072, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -7.068105220794678, "rewards/margins": 2.5036251544952393, "rewards/rejected": -9.571730613708496, "step": 3770 }, { "epoch": 1.1160318866253323, "grad_norm": 18.147441864013672, "learning_rate": 1.3995093002657219e-05, "logits/chosen": 0.7106090784072876, "logits/rejected": 0.6881116628646851, "logps/chosen": -269.50567626953125, "logps/rejected": -301.60528564453125, "loss": 0.4511, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -8.177910804748535, "rewards/margins": 2.6084961891174316, "rewards/rejected": -10.786405563354492, "step": 3780 }, { "epoch": 1.1189843519338647, "grad_norm": 20.83672332763672, "learning_rate": 1.3994815470918217e-05, "logits/chosen": 0.7129969596862793, "logits/rejected": 0.6712583303451538, "logps/chosen": -271.4978942871094, "logps/rejected": -293.49871826171875, "loss": 0.2888, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -7.923611640930176, "rewards/margins": 2.8534398078918457, "rewards/rejected": -10.777050971984863, "step": 3790 }, { "epoch": 1.1219368172423974, "grad_norm": 12.68443489074707, "learning_rate": 1.3994537939179214e-05, "logits/chosen": 0.7308415174484253, "logits/rejected": 0.7006453275680542, "logps/chosen": -276.18402099609375, "logps/rejected": -305.8212585449219, "loss": 0.4041, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -8.528311729431152, "rewards/margins": 2.9342427253723145, "rewards/rejected": -11.462553024291992, "step": 3800 }, { "epoch": 1.12488928255093, "grad_norm": 14.239157676696777, "learning_rate": 1.3994260407440213e-05, "logits/chosen": 0.994149386882782, "logits/rejected": 0.9565375447273254, "logps/chosen": -273.99285888671875, "logps/rejected": -301.91314697265625, "loss": 0.2881, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -7.946053981781006, "rewards/margins": 2.7714293003082275, "rewards/rejected": -10.717483520507812, "step": 3810 }, { "epoch": 1.1278417478594627, "grad_norm": 29.298887252807617, "learning_rate": 1.399398287570121e-05, "logits/chosen": 0.8881340026855469, "logits/rejected": 0.8327795267105103, "logps/chosen": -277.9426574707031, "logps/rejected": -307.6393127441406, "loss": 0.3589, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -8.531688690185547, "rewards/margins": 2.891892433166504, "rewards/rejected": -11.42358112335205, "step": 3820 }, { "epoch": 1.1307942131679953, "grad_norm": 14.629834175109863, "learning_rate": 1.3993705343962209e-05, "logits/chosen": 0.7181685566902161, "logits/rejected": 0.6828103065490723, "logps/chosen": -276.6257019042969, "logps/rejected": -310.322265625, "loss": 0.3148, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -8.42992115020752, "rewards/margins": 3.0532734394073486, "rewards/rejected": -11.483194351196289, "step": 3830 }, { "epoch": 1.133746678476528, "grad_norm": 9.980766296386719, "learning_rate": 1.3993427812223208e-05, "logits/chosen": 0.9479113817214966, "logits/rejected": 0.9252488017082214, "logps/chosen": -264.0487365722656, "logps/rejected": -296.57269287109375, "loss": 0.3237, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.168757438659668, "rewards/margins": 3.1169257164001465, "rewards/rejected": -10.285683631896973, "step": 3840 }, { "epoch": 1.1366991437850604, "grad_norm": 15.918347358703613, "learning_rate": 1.3993150280484205e-05, "logits/chosen": 0.6453667879104614, "logits/rejected": 0.6318212747573853, "logps/chosen": -269.149169921875, "logps/rejected": -302.33319091796875, "loss": 0.3121, "rewards/accuracies": 0.8833333849906921, "rewards/chosen": -7.608902931213379, "rewards/margins": 2.790874719619751, "rewards/rejected": -10.39977741241455, "step": 3850 }, { "epoch": 1.139651609093593, "grad_norm": 7.648816108703613, "learning_rate": 1.3992872748745202e-05, "logits/chosen": 0.9600270390510559, "logits/rejected": 0.9377667307853699, "logps/chosen": -266.99676513671875, "logps/rejected": -290.70416259765625, "loss": 0.302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.164715766906738, "rewards/margins": 3.054461717605591, "rewards/rejected": -10.21917724609375, "step": 3860 }, { "epoch": 1.1426040744021257, "grad_norm": 5.863312244415283, "learning_rate": 1.39925952170062e-05, "logits/chosen": 0.6131603121757507, "logits/rejected": 0.5570327043533325, "logps/chosen": -270.4508361816406, "logps/rejected": -306.5147399902344, "loss": 0.2976, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.99780797958374, "rewards/margins": 3.3482346534729004, "rewards/rejected": -11.34604263305664, "step": 3870 }, { "epoch": 1.1455565397106584, "grad_norm": 17.68100357055664, "learning_rate": 1.39923176852672e-05, "logits/chosen": 0.7149556279182434, "logits/rejected": 0.669957160949707, "logps/chosen": -277.15826416015625, "logps/rejected": -299.9850158691406, "loss": 0.4496, "rewards/accuracies": 0.8166667819023132, "rewards/chosen": -8.450432777404785, "rewards/margins": 2.4499781131744385, "rewards/rejected": -10.900410652160645, "step": 3880 }, { "epoch": 1.148509005019191, "grad_norm": 3.35732364654541, "learning_rate": 1.3992040153528197e-05, "logits/chosen": 0.611181378364563, "logits/rejected": 0.5759787559509277, "logps/chosen": -268.32012939453125, "logps/rejected": -299.57781982421875, "loss": 0.249, "rewards/accuracies": 0.9166668057441711, "rewards/chosen": -7.467983245849609, "rewards/margins": 3.1541714668273926, "rewards/rejected": -10.62215518951416, "step": 3890 }, { "epoch": 1.1514614703277237, "grad_norm": 18.805973052978516, "learning_rate": 1.3991762621789194e-05, "logits/chosen": 0.7761452198028564, "logits/rejected": 0.7680237293243408, "logps/chosen": -270.04351806640625, "logps/rejected": -301.1358947753906, "loss": 0.509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.859975337982178, "rewards/margins": 2.686856508255005, "rewards/rejected": -10.546831130981445, "step": 3900 }, { "epoch": 1.1544139356362564, "grad_norm": 13.84335994720459, "learning_rate": 1.3991485090050192e-05, "logits/chosen": 0.6409298181533813, "logits/rejected": 0.6187285780906677, "logps/chosen": -261.5778503417969, "logps/rejected": -282.0768127441406, "loss": 0.4424, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -7.050593376159668, "rewards/margins": 2.229386806488037, "rewards/rejected": -9.279979705810547, "step": 3910 }, { "epoch": 1.1573664009447888, "grad_norm": 20.416505813598633, "learning_rate": 1.3991207558311191e-05, "logits/chosen": 0.7801686525344849, "logits/rejected": 0.7607689499855042, "logps/chosen": -264.01220703125, "logps/rejected": -287.47857666015625, "loss": 0.3394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.025997161865234, "rewards/margins": 2.635050058364868, "rewards/rejected": -9.661046981811523, "step": 3920 }, { "epoch": 1.1603188662533215, "grad_norm": 13.749695777893066, "learning_rate": 1.3990930026572188e-05, "logits/chosen": 0.9971164464950562, "logits/rejected": 0.9663597941398621, "logps/chosen": -261.5372314453125, "logps/rejected": -288.4346618652344, "loss": 0.2127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.795307159423828, "rewards/margins": 3.1057522296905518, "rewards/rejected": -9.901060104370117, "step": 3930 }, { "epoch": 1.1632713315618541, "grad_norm": 11.176308631896973, "learning_rate": 1.3990652494833185e-05, "logits/chosen": 1.0405288934707642, "logits/rejected": 1.003983974456787, "logps/chosen": -264.8949279785156, "logps/rejected": -300.75677490234375, "loss": 0.2334, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -7.5243425369262695, "rewards/margins": 3.400277614593506, "rewards/rejected": -10.9246187210083, "step": 3940 }, { "epoch": 1.1662237968703868, "grad_norm": 7.1926589012146, "learning_rate": 1.3990374963094184e-05, "logits/chosen": 1.0745716094970703, "logits/rejected": 1.0315403938293457, "logps/chosen": -264.5164794921875, "logps/rejected": -293.4964599609375, "loss": 0.2866, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -7.391366481781006, "rewards/margins": 2.717827796936035, "rewards/rejected": -10.109195709228516, "step": 3950 }, { "epoch": 1.1691762621789195, "grad_norm": 14.957414627075195, "learning_rate": 1.3990097431355183e-05, "logits/chosen": 0.5332013964653015, "logits/rejected": 0.5057114362716675, "logps/chosen": -277.15496826171875, "logps/rejected": -304.8959045410156, "loss": 0.3798, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -8.614336967468262, "rewards/margins": 2.497170925140381, "rewards/rejected": -11.111506462097168, "step": 3960 }, { "epoch": 1.1721287274874521, "grad_norm": 22.063352584838867, "learning_rate": 1.398981989961618e-05, "logits/chosen": 0.8523737192153931, "logits/rejected": 0.8229592442512512, "logps/chosen": -271.50518798828125, "logps/rejected": -300.90289306640625, "loss": 0.4968, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -8.212048530578613, "rewards/margins": 2.500626802444458, "rewards/rejected": -10.712674140930176, "step": 3970 }, { "epoch": 1.1750811927959846, "grad_norm": 18.313377380371094, "learning_rate": 1.3989542367877177e-05, "logits/chosen": 0.8125611543655396, "logits/rejected": 0.7557014226913452, "logps/chosen": -276.20281982421875, "logps/rejected": -297.1690368652344, "loss": 0.4685, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -8.621707916259766, "rewards/margins": 2.158404588699341, "rewards/rejected": -10.780112266540527, "step": 3980 }, { "epoch": 1.1780336581045172, "grad_norm": 10.004650115966797, "learning_rate": 1.3989264836138177e-05, "logits/chosen": 0.5399643778800964, "logits/rejected": 0.5120288133621216, "logps/chosen": -263.34515380859375, "logps/rejected": -292.05633544921875, "loss": 0.3404, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -7.483403205871582, "rewards/margins": 2.488957166671753, "rewards/rejected": -9.972360610961914, "step": 3990 }, { "epoch": 1.1809861234130499, "grad_norm": 7.784177780151367, "learning_rate": 1.3988987304399174e-05, "logits/chosen": 0.7904013395309448, "logits/rejected": 0.7233549952507019, "logps/chosen": -277.74884033203125, "logps/rejected": -306.0140075683594, "loss": 0.3301, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.961581230163574, "rewards/margins": 3.11561918258667, "rewards/rejected": -11.077199935913086, "step": 4000 } ], "logging_steps": 10, "max_steps": 508050, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }