diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23247 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999183606825047, + "eval_steps": 100, + "global_step": 1531, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000653114539962446, + "grad_norm": 10.320163557955956, + "learning_rate": 9.74025974025974e-10, + "logits/chosen": -1.751611590385437, + "logits/rejected": -1.8014392852783203, + "logps/chosen": -475.2503662109375, + "logps/rejected": -473.9908752441406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.001306229079924892, + "grad_norm": 9.57458924359402, + "learning_rate": 1.948051948051948e-09, + "logits/chosen": -1.6125147342681885, + "logits/rejected": -1.6155394315719604, + "logps/chosen": -510.954833984375, + "logps/rejected": -456.25836181640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0019593436198873378, + "grad_norm": 7.386703525361656, + "learning_rate": 2.9220779220779217e-09, + "logits/chosen": -1.6310054063796997, + "logits/rejected": -1.664994239807129, + "logps/chosen": -526.9005737304688, + "logps/rejected": -502.1066589355469, + "loss": 0.6936, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0020557926036417484, + "rewards/margins": -0.0028404928743839264, + "rewards/rejected": 0.0007846998050808907, + "step": 3 + }, + { + "epoch": 0.002612458159849784, + "grad_norm": 8.184972062975513, + "learning_rate": 3.896103896103896e-09, + "logits/chosen": -1.6248514652252197, + "logits/rejected": -1.6408921480178833, + "logps/chosen": -452.23651123046875, + "logps/rejected": -464.2245178222656, + "loss": 0.6927, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0005986166652292013, + "rewards/margins": 0.0015139534370973706, + "rewards/rejected": -0.0009153365390375257, + "step": 4 + }, + { + "epoch": 0.0032655726998122294, + "grad_norm": 11.574533999877016, + "learning_rate": 4.8701298701298695e-09, + "logits/chosen": -1.7286796569824219, + "logits/rejected": -1.7585710287094116, + "logps/chosen": -462.3341979980469, + "logps/rejected": -441.8200378417969, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00044421182246878743, + "rewards/margins": 0.00013704760931432247, + "rewards/rejected": -0.0005812598392367363, + "step": 5 + }, + { + "epoch": 0.0039186872397746755, + "grad_norm": 9.259170331718803, + "learning_rate": 5.844155844155843e-09, + "logits/chosen": -1.7418220043182373, + "logits/rejected": -1.7892532348632812, + "logps/chosen": -522.937744140625, + "logps/rejected": -552.8088989257812, + "loss": 0.6929, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0012052917154505849, + "rewards/margins": -0.002461440395563841, + "rewards/rejected": 0.0012561489129438996, + "step": 6 + }, + { + "epoch": 0.0045718017797371216, + "grad_norm": 7.63789930024362, + "learning_rate": 6.818181818181818e-09, + "logits/chosen": -1.7751970291137695, + "logits/rejected": -1.8038901090621948, + "logps/chosen": -533.4199829101562, + "logps/rejected": -480.9632263183594, + "loss": 0.6925, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0024563027545809746, + "rewards/margins": 0.0012150906259194016, + "rewards/rejected": 0.0012412117794156075, + "step": 7 + }, + { + "epoch": 0.005224916319699568, + "grad_norm": 9.961517065455444, + "learning_rate": 7.792207792207793e-09, + "logits/chosen": -1.7411315441131592, + "logits/rejected": -1.754071593284607, + "logps/chosen": -495.657470703125, + "logps/rejected": -475.14959716796875, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00112921220716089, + "rewards/margins": -0.0019524528179317713, + "rewards/rejected": 0.0008232402033172548, + "step": 8 + }, + { + "epoch": 0.005878030859662013, + "grad_norm": 6.85854758907971, + "learning_rate": 8.766233766233765e-09, + "logits/chosen": -1.6711037158966064, + "logits/rejected": -1.7036432027816772, + "logps/chosen": -506.71331787109375, + "logps/rejected": -518.3823852539062, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.00047096985508687794, + "rewards/margins": -0.0014907550066709518, + "rewards/rejected": 0.0019617248326539993, + "step": 9 + }, + { + "epoch": 0.006531145399624459, + "grad_norm": 11.552115994362305, + "learning_rate": 9.740259740259739e-09, + "logits/chosen": -1.6863927841186523, + "logits/rejected": -1.6270835399627686, + "logps/chosen": -527.3156127929688, + "logps/rejected": -595.2274169921875, + "loss": 0.6923, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0007424141513183713, + "rewards/margins": 0.0002746771788224578, + "rewards/rejected": -0.0010170910973101854, + "step": 10 + }, + { + "epoch": 0.007184259939586905, + "grad_norm": 9.285086199342572, + "learning_rate": 1.0714285714285713e-08, + "logits/chosen": -1.6245068311691284, + "logits/rejected": -1.6487013101577759, + "logps/chosen": -552.9703369140625, + "logps/rejected": -526.51025390625, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0022101732902228832, + "rewards/margins": 0.0029705429915338755, + "rewards/rejected": -0.0007603692938573658, + "step": 11 + }, + { + "epoch": 0.007837374479549351, + "grad_norm": 6.699656683031416, + "learning_rate": 1.1688311688311687e-08, + "logits/chosen": -1.7146100997924805, + "logits/rejected": -1.7387487888336182, + "logps/chosen": -512.6212158203125, + "logps/rejected": -510.3314514160156, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001821169862523675, + "rewards/margins": 0.0011536835227161646, + "rewards/rejected": -0.0029748533852398396, + "step": 12 + }, + { + "epoch": 0.008490489019511797, + "grad_norm": 7.205708657995355, + "learning_rate": 1.2662337662337662e-08, + "logits/chosen": -1.7469621896743774, + "logits/rejected": -1.7776381969451904, + "logps/chosen": -585.2794799804688, + "logps/rejected": -501.048828125, + "loss": 0.6933, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0012821245472878218, + "rewards/margins": 0.001307644764892757, + "rewards/rejected": -2.552039222791791e-05, + "step": 13 + }, + { + "epoch": 0.009143603559474243, + "grad_norm": 17.744991773610803, + "learning_rate": 1.3636363636363636e-08, + "logits/chosen": -1.704886555671692, + "logits/rejected": -1.7239997386932373, + "logps/chosen": -522.1668090820312, + "logps/rejected": -519.49951171875, + "loss": 0.6931, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.00018573057604953647, + "rewards/margins": 0.0024611069820821285, + "rewards/rejected": -0.0022753761149942875, + "step": 14 + }, + { + "epoch": 0.00979671809943669, + "grad_norm": 6.553103071836206, + "learning_rate": 1.461038961038961e-08, + "logits/chosen": -1.715399980545044, + "logits/rejected": -1.724109411239624, + "logps/chosen": -551.634765625, + "logps/rejected": -474.11785888671875, + "loss": 0.6937, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.001085147843696177, + "rewards/margins": 0.0005957795074209571, + "rewards/rejected": 0.0004893685108982027, + "step": 15 + }, + { + "epoch": 0.010449832639399135, + "grad_norm": 10.474583885683455, + "learning_rate": 1.5584415584415586e-08, + "logits/chosen": -1.791701078414917, + "logits/rejected": -1.7533316612243652, + "logps/chosen": -495.9596252441406, + "logps/rejected": -524.2186279296875, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0018585537327453494, + "rewards/margins": -0.001228995155543089, + "rewards/rejected": -0.0006295585772022605, + "step": 16 + }, + { + "epoch": 0.011102947179361581, + "grad_norm": 8.585521664394983, + "learning_rate": 1.6558441558441556e-08, + "logits/chosen": -1.689960241317749, + "logits/rejected": -1.660653829574585, + "logps/chosen": -493.1070251464844, + "logps/rejected": -558.66748046875, + "loss": 0.6931, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0004895019810646772, + "rewards/margins": 0.0019447661470621824, + "rewards/rejected": -0.0014552639331668615, + "step": 17 + }, + { + "epoch": 0.011756061719324026, + "grad_norm": 7.688284862178414, + "learning_rate": 1.753246753246753e-08, + "logits/chosen": -1.634385108947754, + "logits/rejected": -1.6527695655822754, + "logps/chosen": -476.8695068359375, + "logps/rejected": -481.3186950683594, + "loss": 0.6935, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0008362793596461415, + "rewards/margins": -0.0002491784398443997, + "rewards/rejected": 0.0010854577412828803, + "step": 18 + }, + { + "epoch": 0.012409176259286472, + "grad_norm": 11.300638272495242, + "learning_rate": 1.8506493506493504e-08, + "logits/chosen": -1.7336674928665161, + "logits/rejected": -1.7424148321151733, + "logps/chosen": -555.4613037109375, + "logps/rejected": -531.6415405273438, + "loss": 0.693, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0010435438016429543, + "rewards/margins": -0.0002537918044254184, + "rewards/rejected": 0.001297335489653051, + "step": 19 + }, + { + "epoch": 0.013062290799248918, + "grad_norm": 7.209810609202922, + "learning_rate": 1.9480519480519478e-08, + "logits/chosen": -1.6538033485412598, + "logits/rejected": -1.676405668258667, + "logps/chosen": -535.2705688476562, + "logps/rejected": -540.2841186523438, + "loss": 0.6926, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.000612707226537168, + "rewards/margins": -0.0002046869631158188, + "rewards/rejected": -0.0004080201033502817, + "step": 20 + }, + { + "epoch": 0.013715405339211364, + "grad_norm": 9.239564322245995, + "learning_rate": 2.0454545454545452e-08, + "logits/chosen": -1.6935447454452515, + "logits/rejected": -1.7362101078033447, + "logps/chosen": -483.9548034667969, + "logps/rejected": -443.07958984375, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00011293424176983535, + "rewards/margins": 0.0016996811609715223, + "rewards/rejected": -0.0018126153154298663, + "step": 21 + }, + { + "epoch": 0.01436851987917381, + "grad_norm": 11.953662843213952, + "learning_rate": 2.1428571428571426e-08, + "logits/chosen": -1.649803638458252, + "logits/rejected": -1.6609901189804077, + "logps/chosen": -556.7703247070312, + "logps/rejected": -528.4605102539062, + "loss": 0.6941, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0003633833257481456, + "rewards/margins": -0.001797966891899705, + "rewards/rejected": 0.0021613501012325287, + "step": 22 + }, + { + "epoch": 0.015021634419136256, + "grad_norm": 5.585424980323363, + "learning_rate": 2.24025974025974e-08, + "logits/chosen": -1.7381466627120972, + "logits/rejected": -1.7497470378875732, + "logps/chosen": -470.3625183105469, + "logps/rejected": -458.3432312011719, + "loss": 0.6932, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0006353663047775626, + "rewards/margins": -0.0016739725833758712, + "rewards/rejected": 0.002309338888153434, + "step": 23 + }, + { + "epoch": 0.015674748959098702, + "grad_norm": 12.282080216373183, + "learning_rate": 2.3376623376623374e-08, + "logits/chosen": -1.6787421703338623, + "logits/rejected": -1.6916463375091553, + "logps/chosen": -500.0643615722656, + "logps/rejected": -530.3701171875, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00024698246852494776, + "rewards/margins": 0.0018526006024330854, + "rewards/rejected": -0.0016056179301813245, + "step": 24 + }, + { + "epoch": 0.016327863499061148, + "grad_norm": 6.151170677406452, + "learning_rate": 2.435064935064935e-08, + "logits/chosen": -1.730494499206543, + "logits/rejected": -1.7640714645385742, + "logps/chosen": -567.3126831054688, + "logps/rejected": -524.96630859375, + "loss": 0.694, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.002176732989028096, + "rewards/margins": -0.0018334074411541224, + "rewards/rejected": 0.004010140895843506, + "step": 25 + }, + { + "epoch": 0.016980978039023594, + "grad_norm": 6.027535384301359, + "learning_rate": 2.5324675324675325e-08, + "logits/chosen": -1.700589895248413, + "logits/rejected": -1.655698537826538, + "logps/chosen": -525.3576049804688, + "logps/rejected": -536.69384765625, + "loss": 0.6926, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.002966861240565777, + "rewards/margins": -0.00015424739103764296, + "rewards/rejected": 0.0031211089808493853, + "step": 26 + }, + { + "epoch": 0.01763409257898604, + "grad_norm": 10.92870471034886, + "learning_rate": 2.62987012987013e-08, + "logits/chosen": -1.7824229001998901, + "logits/rejected": -1.7060723304748535, + "logps/chosen": -499.6011962890625, + "logps/rejected": -459.31390380859375, + "loss": 0.6928, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0014267514925450087, + "rewards/margins": -0.000579934217967093, + "rewards/rejected": -0.0008468173909932375, + "step": 27 + }, + { + "epoch": 0.018287207118948486, + "grad_norm": 10.356153154714054, + "learning_rate": 2.7272727272727272e-08, + "logits/chosen": -1.7105991840362549, + "logits/rejected": -1.704366683959961, + "logps/chosen": -532.1188354492188, + "logps/rejected": -546.0477905273438, + "loss": 0.6924, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0009431360522285104, + "rewards/margins": 0.0037088487297296524, + "rewards/rejected": -0.00276571256108582, + "step": 28 + }, + { + "epoch": 0.018940321658910932, + "grad_norm": 6.73381910894591, + "learning_rate": 2.8246753246753246e-08, + "logits/chosen": -1.651279330253601, + "logits/rejected": -1.6885946989059448, + "logps/chosen": -498.8572998046875, + "logps/rejected": -454.43701171875, + "loss": 0.6935, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0029693078249692917, + "rewards/margins": 0.00039883877616375685, + "rewards/rejected": 0.0025704693980515003, + "step": 29 + }, + { + "epoch": 0.01959343619887338, + "grad_norm": 9.815644247246649, + "learning_rate": 2.922077922077922e-08, + "logits/chosen": -1.659732699394226, + "logits/rejected": -1.6510050296783447, + "logps/chosen": -527.5221557617188, + "logps/rejected": -572.2091064453125, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00042819028021767735, + "rewards/margins": 0.0010157201904803514, + "rewards/rejected": -0.0014439105289056897, + "step": 30 + }, + { + "epoch": 0.020246550738835824, + "grad_norm": 6.574072733494995, + "learning_rate": 3.01948051948052e-08, + "logits/chosen": -1.7301993370056152, + "logits/rejected": -1.7147951126098633, + "logps/chosen": -495.9303894042969, + "logps/rejected": -487.27630615234375, + "loss": 0.6937, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001899483148008585, + "rewards/margins": 0.0021509858779609203, + "rewards/rejected": -0.00025150307919830084, + "step": 31 + }, + { + "epoch": 0.02089966527879827, + "grad_norm": 6.336712001931794, + "learning_rate": 3.116883116883117e-08, + "logits/chosen": -1.769822597503662, + "logits/rejected": -1.7909101247787476, + "logps/chosen": -466.05523681640625, + "logps/rejected": -443.83001708984375, + "loss": 0.6939, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.002961869351565838, + "rewards/margins": -0.004024825058877468, + "rewards/rejected": 0.0010629557073116302, + "step": 32 + }, + { + "epoch": 0.021552779818760717, + "grad_norm": 7.984443377717489, + "learning_rate": 3.214285714285714e-08, + "logits/chosen": -1.6552331447601318, + "logits/rejected": -1.6343588829040527, + "logps/chosen": -473.5636901855469, + "logps/rejected": -490.8223876953125, + "loss": 0.6935, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0031329344492405653, + "rewards/margins": 0.0010508847190067172, + "rewards/rejected": -0.004183819051831961, + "step": 33 + }, + { + "epoch": 0.022205894358723163, + "grad_norm": 6.978522846761633, + "learning_rate": 3.311688311688311e-08, + "logits/chosen": -1.685523271560669, + "logits/rejected": -1.6947942972183228, + "logps/chosen": -488.18597412109375, + "logps/rejected": -450.8877258300781, + "loss": 0.6926, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0016193913761526346, + "rewards/margins": 0.001421906752511859, + "rewards/rejected": 0.00019748439081013203, + "step": 34 + }, + { + "epoch": 0.02285900889868561, + "grad_norm": 7.493471887712862, + "learning_rate": 3.4090909090909086e-08, + "logits/chosen": -1.711535096168518, + "logits/rejected": -1.7027740478515625, + "logps/chosen": -518.72314453125, + "logps/rejected": -475.7657165527344, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0004046607355121523, + "rewards/margins": 0.001060183160007, + "rewards/rejected": -0.0006555224535986781, + "step": 35 + }, + { + "epoch": 0.02351212343864805, + "grad_norm": 11.205930629830023, + "learning_rate": 3.506493506493506e-08, + "logits/chosen": -1.7367273569107056, + "logits/rejected": -1.762798547744751, + "logps/chosen": -453.77130126953125, + "logps/rejected": -481.04730224609375, + "loss": 0.6929, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00015169614925980568, + "rewards/margins": 0.0007772659882903099, + "rewards/rejected": -0.0009289622539654374, + "step": 36 + }, + { + "epoch": 0.024165237978610497, + "grad_norm": 9.467273081236305, + "learning_rate": 3.6038961038961034e-08, + "logits/chosen": -1.7246737480163574, + "logits/rejected": -1.7480111122131348, + "logps/chosen": -466.54998779296875, + "logps/rejected": -427.57373046875, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0015318728983402252, + "rewards/margins": 0.0017306231893599033, + "rewards/rejected": -0.00019875055295415223, + "step": 37 + }, + { + "epoch": 0.024818352518572943, + "grad_norm": 9.156288457593233, + "learning_rate": 3.701298701298701e-08, + "logits/chosen": -1.6512861251831055, + "logits/rejected": -1.690797209739685, + "logps/chosen": -509.2376708984375, + "logps/rejected": -528.0012817382812, + "loss": 0.6931, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0029702354222536087, + "rewards/margins": 0.0016618178924545646, + "rewards/rejected": 0.0013084171805530787, + "step": 38 + }, + { + "epoch": 0.02547146705853539, + "grad_norm": 17.95636045906593, + "learning_rate": 3.798701298701298e-08, + "logits/chosen": -1.6484848260879517, + "logits/rejected": -1.6626923084259033, + "logps/chosen": -444.7795715332031, + "logps/rejected": -453.7108154296875, + "loss": 0.6934, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.000330042967107147, + "rewards/margins": 0.0012718366924673319, + "rewards/rejected": -0.001601879601366818, + "step": 39 + }, + { + "epoch": 0.026124581598497836, + "grad_norm": 8.845607047635447, + "learning_rate": 3.8961038961038956e-08, + "logits/chosen": -1.6774462461471558, + "logits/rejected": -1.6941050291061401, + "logps/chosen": -490.5712890625, + "logps/rejected": -565.9147338867188, + "loss": 0.6937, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0004610727773979306, + "rewards/margins": -0.0016047237440943718, + "rewards/rejected": 0.002065796870738268, + "step": 40 + }, + { + "epoch": 0.02677769613846028, + "grad_norm": 9.919258253862095, + "learning_rate": 3.993506493506493e-08, + "logits/chosen": -1.6783130168914795, + "logits/rejected": -1.6832352876663208, + "logps/chosen": -424.7172546386719, + "logps/rejected": -463.73919677734375, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0029250048100948334, + "rewards/margins": 0.00039243214996531606, + "rewards/rejected": 0.0025325727183371782, + "step": 41 + }, + { + "epoch": 0.027430810678422728, + "grad_norm": 7.033725232378199, + "learning_rate": 4.0909090909090904e-08, + "logits/chosen": -1.7260679006576538, + "logits/rejected": -1.7201330661773682, + "logps/chosen": -540.1764526367188, + "logps/rejected": -501.5211486816406, + "loss": 0.6926, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.3270258679986e-06, + "rewards/margins": 0.0007927274564281106, + "rewards/rejected": -0.0007904003723524511, + "step": 42 + }, + { + "epoch": 0.028083925218385174, + "grad_norm": 8.639855374503854, + "learning_rate": 4.188311688311688e-08, + "logits/chosen": -1.7189686298370361, + "logits/rejected": -1.7004797458648682, + "logps/chosen": -505.0670471191406, + "logps/rejected": -522.9998168945312, + "loss": 0.6933, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0024530505761504173, + "rewards/margins": -0.0023120976984500885, + "rewards/rejected": 0.004765148274600506, + "step": 43 + }, + { + "epoch": 0.02873703975834762, + "grad_norm": 7.855142409694213, + "learning_rate": 4.285714285714285e-08, + "logits/chosen": -1.8125633001327515, + "logits/rejected": -1.8358653783798218, + "logps/chosen": -449.2396545410156, + "logps/rejected": -412.31622314453125, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.799867307767272e-05, + "rewards/margins": 0.0013592124450951815, + "rewards/rejected": -0.0014372109435498714, + "step": 44 + }, + { + "epoch": 0.029390154298310066, + "grad_norm": 14.0615352021285, + "learning_rate": 4.3831168831168825e-08, + "logits/chosen": -1.6212176084518433, + "logits/rejected": -1.7029989957809448, + "logps/chosen": -485.98822021484375, + "logps/rejected": -451.6901550292969, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002106280066072941, + "rewards/margins": -0.00013485189992934465, + "rewards/rejected": 0.0022411320824176073, + "step": 45 + }, + { + "epoch": 0.030043268838272512, + "grad_norm": 13.882948188534005, + "learning_rate": 4.48051948051948e-08, + "logits/chosen": -1.7054741382598877, + "logits/rejected": -1.6707385778427124, + "logps/chosen": -523.1839599609375, + "logps/rejected": -538.1018676757812, + "loss": 0.6925, + "rewards/accuracies": 0.4375, + "rewards/chosen": 6.611342541873455e-05, + "rewards/margins": -0.0016670847544446588, + "rewards/rejected": 0.0017331981798633933, + "step": 46 + }, + { + "epoch": 0.030696383378234958, + "grad_norm": 9.843375032358303, + "learning_rate": 4.577922077922077e-08, + "logits/chosen": -1.7931544780731201, + "logits/rejected": -1.8134924173355103, + "logps/chosen": -576.3263549804688, + "logps/rejected": -531.6103515625, + "loss": 0.6925, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.00019430162501521409, + "rewards/margins": -0.0015074944822117686, + "rewards/rejected": 0.0013131927698850632, + "step": 47 + }, + { + "epoch": 0.031349497918197404, + "grad_norm": 13.08440244244893, + "learning_rate": 4.675324675324675e-08, + "logits/chosen": -1.728432536125183, + "logits/rejected": -1.687523365020752, + "logps/chosen": -424.1823425292969, + "logps/rejected": -436.7662353515625, + "loss": 0.6932, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0026053618639707565, + "rewards/margins": 0.0011907146545127034, + "rewards/rejected": 0.0014146470930427313, + "step": 48 + }, + { + "epoch": 0.03200261245815985, + "grad_norm": 9.954241633042399, + "learning_rate": 4.772727272727273e-08, + "logits/chosen": -1.7366923093795776, + "logits/rejected": -1.7875515222549438, + "logps/chosen": -482.4845886230469, + "logps/rejected": -462.21734619140625, + "loss": 0.6931, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.002501377835869789, + "rewards/margins": 0.0017617797711864114, + "rewards/rejected": 0.0007395982975140214, + "step": 49 + }, + { + "epoch": 0.032655726998122296, + "grad_norm": 7.516649010433926, + "learning_rate": 4.87012987012987e-08, + "logits/chosen": -1.643943190574646, + "logits/rejected": -1.6239250898361206, + "logps/chosen": -571.3704223632812, + "logps/rejected": -552.835205078125, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005948520265519619, + "rewards/margins": -0.001149489893577993, + "rewards/rejected": 0.0017443416872993112, + "step": 50 + }, + { + "epoch": 0.03330884153808474, + "grad_norm": 8.673266186430727, + "learning_rate": 4.9675324675324675e-08, + "logits/chosen": -1.654160499572754, + "logits/rejected": -1.6642738580703735, + "logps/chosen": -568.924072265625, + "logps/rejected": -502.4623107910156, + "loss": 0.6928, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.003631243482232094, + "rewards/margins": 0.002229990903288126, + "rewards/rejected": 0.0014012528117746115, + "step": 51 + }, + { + "epoch": 0.03396195607804719, + "grad_norm": 5.760323905203282, + "learning_rate": 5.064935064935065e-08, + "logits/chosen": -1.7572216987609863, + "logits/rejected": -1.746352195739746, + "logps/chosen": -500.9621276855469, + "logps/rejected": -516.9683837890625, + "loss": 0.6923, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0025606201961636543, + "rewards/margins": 0.0012761401012539864, + "rewards/rejected": 0.0012844798620790243, + "step": 52 + }, + { + "epoch": 0.034615070618009634, + "grad_norm": 13.745858132264178, + "learning_rate": 5.162337662337662e-08, + "logits/chosen": -1.6922714710235596, + "logits/rejected": -1.6829943656921387, + "logps/chosen": -521.5164794921875, + "logps/rejected": -511.12115478515625, + "loss": 0.6929, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0012258647475391626, + "rewards/margins": 0.0017541146371513605, + "rewards/rejected": -0.0005282496567815542, + "step": 53 + }, + { + "epoch": 0.03526818515797208, + "grad_norm": 9.901929656057623, + "learning_rate": 5.25974025974026e-08, + "logits/chosen": -1.6156866550445557, + "logits/rejected": -1.6489068269729614, + "logps/chosen": -504.8508605957031, + "logps/rejected": -510.78070068359375, + "loss": 0.6923, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0022437286097556353, + "rewards/margins": 0.0005393887404352427, + "rewards/rejected": 0.0017043398693203926, + "step": 54 + }, + { + "epoch": 0.035921299697934526, + "grad_norm": 7.628193867040592, + "learning_rate": 5.357142857142857e-08, + "logits/chosen": -1.7694371938705444, + "logits/rejected": -1.7641582489013672, + "logps/chosen": -497.0212707519531, + "logps/rejected": -577.7604370117188, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00022474752040579915, + "rewards/margins": 0.0001437664614059031, + "rewards/rejected": 8.098152466118336e-05, + "step": 55 + }, + { + "epoch": 0.03657441423789697, + "grad_norm": 6.526308952187627, + "learning_rate": 5.4545454545454545e-08, + "logits/chosen": -1.6863549947738647, + "logits/rejected": -1.726689100265503, + "logps/chosen": -570.279052734375, + "logps/rejected": -529.3751831054688, + "loss": 0.693, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0028358842246234417, + "rewards/margins": -0.0016305134631693363, + "rewards/rejected": -0.00120537041220814, + "step": 56 + }, + { + "epoch": 0.03722752877785942, + "grad_norm": 7.971501408541291, + "learning_rate": 5.551948051948052e-08, + "logits/chosen": -1.7459444999694824, + "logits/rejected": -1.8165135383605957, + "logps/chosen": -500.9310302734375, + "logps/rejected": -476.9623107910156, + "loss": 0.6933, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.002534267958253622, + "rewards/margins": -0.00011749513214454055, + "rewards/rejected": -0.0024167723022401333, + "step": 57 + }, + { + "epoch": 0.037880643317821865, + "grad_norm": 19.843826487104433, + "learning_rate": 5.649350649350649e-08, + "logits/chosen": -1.6915953159332275, + "logits/rejected": -1.7043017148971558, + "logps/chosen": -437.893310546875, + "logps/rejected": -444.86895751953125, + "loss": 0.6934, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0005522965220734477, + "rewards/margins": -0.0005562017904594541, + "rewards/rejected": 3.905253834091127e-06, + "step": 58 + }, + { + "epoch": 0.03853375785778431, + "grad_norm": 17.45299457584043, + "learning_rate": 5.7467532467532466e-08, + "logits/chosen": -1.7828876972198486, + "logits/rejected": -1.7247426509857178, + "logps/chosen": -489.2809753417969, + "logps/rejected": -575.0067138671875, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004004344809800386, + "rewards/margins": -0.001498160301707685, + "rewards/rejected": -0.0025061843916773796, + "step": 59 + }, + { + "epoch": 0.03918687239774676, + "grad_norm": 7.706775067267686, + "learning_rate": 5.844155844155844e-08, + "logits/chosen": -1.7627689838409424, + "logits/rejected": -1.7395163774490356, + "logps/chosen": -543.8458251953125, + "logps/rejected": -535.2295532226562, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002326030982658267, + "rewards/margins": 0.0011458850931376219, + "rewards/rejected": 0.0011801458895206451, + "step": 60 + }, + { + "epoch": 0.0398399869377092, + "grad_norm": 6.253034693514966, + "learning_rate": 5.9415584415584414e-08, + "logits/chosen": -1.6752774715423584, + "logits/rejected": -1.6769983768463135, + "logps/chosen": -513.5790405273438, + "logps/rejected": -491.4671630859375, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": 4.048121627420187e-05, + "rewards/margins": 0.00097788090351969, + "rewards/rejected": -0.0009373998618684709, + "step": 61 + }, + { + "epoch": 0.04049310147767165, + "grad_norm": 8.25357941061994, + "learning_rate": 6.03896103896104e-08, + "logits/chosen": -1.7725433111190796, + "logits/rejected": -1.8016797304153442, + "logps/chosen": -504.8186950683594, + "logps/rejected": -534.6757202148438, + "loss": 0.6929, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0038966606371104717, + "rewards/margins": 0.0019743391312658787, + "rewards/rejected": 0.0019223212730139494, + "step": 62 + }, + { + "epoch": 0.041146216017634095, + "grad_norm": 7.603828708440927, + "learning_rate": 6.136363636363636e-08, + "logits/chosen": -1.784754991531372, + "logits/rejected": -1.7742725610733032, + "logps/chosen": -547.2255249023438, + "logps/rejected": -546.972412109375, + "loss": 0.6936, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.002894229721277952, + "rewards/margins": -0.0006492708926089108, + "rewards/rejected": 0.0035435007885098457, + "step": 63 + }, + { + "epoch": 0.04179933055759654, + "grad_norm": 15.969939708450756, + "learning_rate": 6.233766233766234e-08, + "logits/chosen": -1.654356837272644, + "logits/rejected": -1.6166218519210815, + "logps/chosen": -488.8111877441406, + "logps/rejected": -569.5529174804688, + "loss": 0.692, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.003502319101244211, + "rewards/margins": 0.005036606453359127, + "rewards/rejected": -0.0015342880506068468, + "step": 64 + }, + { + "epoch": 0.04245244509755899, + "grad_norm": 22.983780138981498, + "learning_rate": 6.331168831168831e-08, + "logits/chosen": -1.7115808725357056, + "logits/rejected": -1.7262012958526611, + "logps/chosen": -499.15399169921875, + "logps/rejected": -488.51214599609375, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0016056394670158625, + "rewards/margins": 0.0008727168897166848, + "rewards/rejected": 0.0007329225772991776, + "step": 65 + }, + { + "epoch": 0.04310555963752143, + "grad_norm": 11.044651160280129, + "learning_rate": 6.428571428571428e-08, + "logits/chosen": -1.795534610748291, + "logits/rejected": -1.799638271331787, + "logps/chosen": -533.3612670898438, + "logps/rejected": -621.8697509765625, + "loss": 0.6923, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.001967411022633314, + "rewards/margins": 0.003664374118670821, + "rewards/rejected": -0.001696963096037507, + "step": 66 + }, + { + "epoch": 0.04375867417748388, + "grad_norm": 25.764603545634195, + "learning_rate": 6.525974025974026e-08, + "logits/chosen": -1.7212399244308472, + "logits/rejected": -1.6900732517242432, + "logps/chosen": -433.75335693359375, + "logps/rejected": -483.9370422363281, + "loss": 0.6929, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.005031700246036053, + "rewards/margins": 0.00277211656793952, + "rewards/rejected": 0.002259582979604602, + "step": 67 + }, + { + "epoch": 0.044411788717446325, + "grad_norm": 7.6456802404761985, + "learning_rate": 6.623376623376622e-08, + "logits/chosen": -1.8252774477005005, + "logits/rejected": -1.7665594816207886, + "logps/chosen": -505.1371765136719, + "logps/rejected": -519.2654418945312, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": -7.071479922160506e-05, + "rewards/margins": 0.0002764108357951045, + "rewards/rejected": -0.00034712543128989637, + "step": 68 + }, + { + "epoch": 0.04506490325740877, + "grad_norm": 22.249196941026028, + "learning_rate": 6.72077922077922e-08, + "logits/chosen": -1.7685558795928955, + "logits/rejected": -1.754807710647583, + "logps/chosen": -498.36309814453125, + "logps/rejected": -479.595947265625, + "loss": 0.6927, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0044998289085924625, + "rewards/margins": 0.0033680484630167484, + "rewards/rejected": 0.0011317802127450705, + "step": 69 + }, + { + "epoch": 0.04571801779737122, + "grad_norm": 18.41888656875299, + "learning_rate": 6.818181818181817e-08, + "logits/chosen": -1.6847811937332153, + "logits/rejected": -1.7150285243988037, + "logps/chosen": -521.6116333007812, + "logps/rejected": -501.6766662597656, + "loss": 0.6926, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.00515484344214201, + "rewards/margins": 0.0027162095066159964, + "rewards/rejected": 0.0024386332370340824, + "step": 70 + }, + { + "epoch": 0.046371132337333656, + "grad_norm": 7.5323060443272425, + "learning_rate": 6.915584415584415e-08, + "logits/chosen": -1.6845817565917969, + "logits/rejected": -1.6868298053741455, + "logps/chosen": -509.47174072265625, + "logps/rejected": -497.8253173828125, + "loss": 0.6923, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.006278076209127903, + "rewards/margins": 0.003210592083632946, + "rewards/rejected": 0.0030674838926643133, + "step": 71 + }, + { + "epoch": 0.0470242468772961, + "grad_norm": 11.518462131512745, + "learning_rate": 7.012987012987012e-08, + "logits/chosen": -1.689769983291626, + "logits/rejected": -1.6752269268035889, + "logps/chosen": -508.0777587890625, + "logps/rejected": -497.0141906738281, + "loss": 0.6925, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0023371409624814987, + "rewards/margins": -0.0009353661444038153, + "rewards/rejected": 0.0032725068740546703, + "step": 72 + }, + { + "epoch": 0.04767736141725855, + "grad_norm": 13.464519389398141, + "learning_rate": 7.11038961038961e-08, + "logits/chosen": -1.7256083488464355, + "logits/rejected": -1.7064285278320312, + "logps/chosen": -528.0186157226562, + "logps/rejected": -521.5746459960938, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0029949708841741085, + "rewards/margins": 0.0013601587852463126, + "rewards/rejected": 0.0016348123317584395, + "step": 73 + }, + { + "epoch": 0.048330475957220995, + "grad_norm": 20.376786509212838, + "learning_rate": 7.207792207792207e-08, + "logits/chosen": -1.641348123550415, + "logits/rejected": -1.6459161043167114, + "logps/chosen": -482.44732666015625, + "logps/rejected": -473.39190673828125, + "loss": 0.6926, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.0051408931612968445, + "rewards/margins": -0.0015343116829171777, + "rewards/rejected": 0.006675205193459988, + "step": 74 + }, + { + "epoch": 0.04898359049718344, + "grad_norm": 6.970678593146312, + "learning_rate": 7.305194805194805e-08, + "logits/chosen": -1.7044446468353271, + "logits/rejected": -1.68734610080719, + "logps/chosen": -502.9562072753906, + "logps/rejected": -517.5908813476562, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0027034569066017866, + "rewards/margins": 0.0027098727878183126, + "rewards/rejected": -6.4158812165260315e-06, + "step": 75 + }, + { + "epoch": 0.04963670503714589, + "grad_norm": 9.6526922150323, + "learning_rate": 7.402597402597402e-08, + "logits/chosen": -1.7046608924865723, + "logits/rejected": -1.7354328632354736, + "logps/chosen": -535.493408203125, + "logps/rejected": -594.682861328125, + "loss": 0.6927, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.004995846655219793, + "rewards/margins": -0.0013705159071832895, + "rewards/rejected": -0.0036253309808671474, + "step": 76 + }, + { + "epoch": 0.05028981957710833, + "grad_norm": 7.329104460905438, + "learning_rate": 7.5e-08, + "logits/chosen": -1.7437448501586914, + "logits/rejected": -1.786738395690918, + "logps/chosen": -503.44281005859375, + "logps/rejected": -479.0059814453125, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0025321291759610176, + "rewards/margins": 0.007668951991945505, + "rewards/rejected": -0.005136823281645775, + "step": 77 + }, + { + "epoch": 0.05094293411707078, + "grad_norm": 12.493258401725456, + "learning_rate": 7.597402597402596e-08, + "logits/chosen": -1.7119678258895874, + "logits/rejected": -1.7332830429077148, + "logps/chosen": -488.6138916015625, + "logps/rejected": -485.7553405761719, + "loss": 0.6921, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.004445738159120083, + "rewards/margins": 0.002824036870151758, + "rewards/rejected": 0.001621701754629612, + "step": 78 + }, + { + "epoch": 0.051596048657033225, + "grad_norm": 8.01136858731629, + "learning_rate": 7.694805194805194e-08, + "logits/chosen": -1.7600123882293701, + "logits/rejected": -1.7657560110092163, + "logps/chosen": -527.957763671875, + "logps/rejected": -547.0194702148438, + "loss": 0.6924, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0005399225628934801, + "rewards/margins": 0.0007603027625009418, + "rewards/rejected": -0.00022038002498447895, + "step": 79 + }, + { + "epoch": 0.05224916319699567, + "grad_norm": 15.784986319963485, + "learning_rate": 7.792207792207791e-08, + "logits/chosen": -1.7285094261169434, + "logits/rejected": -1.712064504623413, + "logps/chosen": -521.34814453125, + "logps/rejected": -482.8376159667969, + "loss": 0.6921, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0010725975735113025, + "rewards/margins": -0.0014756344025954604, + "rewards/rejected": 0.00040303694549947977, + "step": 80 + }, + { + "epoch": 0.05290227773695812, + "grad_norm": 8.657892032924215, + "learning_rate": 7.889610389610389e-08, + "logits/chosen": -1.6536437273025513, + "logits/rejected": -1.6625897884368896, + "logps/chosen": -549.460205078125, + "logps/rejected": -534.7042236328125, + "loss": 0.6927, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0003026010235771537, + "rewards/margins": 0.0008853006293065846, + "rewards/rejected": -0.0005827000131830573, + "step": 81 + }, + { + "epoch": 0.05355539227692056, + "grad_norm": 10.870568795609778, + "learning_rate": 7.987012987012986e-08, + "logits/chosen": -1.6692464351654053, + "logits/rejected": -1.6596240997314453, + "logps/chosen": -506.01922607421875, + "logps/rejected": -524.4590454101562, + "loss": 0.6917, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0017397881019860506, + "rewards/margins": 0.005637907888740301, + "rewards/rejected": -0.003898120019584894, + "step": 82 + }, + { + "epoch": 0.05420850681688301, + "grad_norm": 5.851977319869193, + "learning_rate": 8.084415584415584e-08, + "logits/chosen": -1.738106608390808, + "logits/rejected": -1.7303545475006104, + "logps/chosen": -482.5537414550781, + "logps/rejected": -484.3572998046875, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0022105362731963396, + "rewards/margins": 0.001744122477248311, + "rewards/rejected": 0.00046641333028674126, + "step": 83 + }, + { + "epoch": 0.054861621356845455, + "grad_norm": 5.9816373124178375, + "learning_rate": 8.181818181818181e-08, + "logits/chosen": -1.7219839096069336, + "logits/rejected": -1.7243647575378418, + "logps/chosen": -531.4603881835938, + "logps/rejected": -624.1871337890625, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001187248039059341, + "rewards/margins": 0.008130445145070553, + "rewards/rejected": -0.009317693300545216, + "step": 84 + }, + { + "epoch": 0.0555147358968079, + "grad_norm": 17.14950100054283, + "learning_rate": 8.279220779220779e-08, + "logits/chosen": -1.7397611141204834, + "logits/rejected": -1.7358020544052124, + "logps/chosen": -475.4699401855469, + "logps/rejected": -454.5489501953125, + "loss": 0.6923, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.004636278375983238, + "rewards/margins": 0.0002992916852235794, + "rewards/rejected": 0.004336986690759659, + "step": 85 + }, + { + "epoch": 0.05616785043677035, + "grad_norm": 10.798912022369151, + "learning_rate": 8.376623376623376e-08, + "logits/chosen": -1.6888103485107422, + "logits/rejected": -1.6987148523330688, + "logps/chosen": -572.60400390625, + "logps/rejected": -599.0338745117188, + "loss": 0.691, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0021805190481245518, + "rewards/margins": 0.006904205307364464, + "rewards/rejected": -0.004723686724901199, + "step": 86 + }, + { + "epoch": 0.05682096497673279, + "grad_norm": 5.848424010532979, + "learning_rate": 8.474025974025974e-08, + "logits/chosen": -1.7512600421905518, + "logits/rejected": -1.77037513256073, + "logps/chosen": -575.0525512695312, + "logps/rejected": -551.678955078125, + "loss": 0.692, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.000223713053856045, + "rewards/margins": 6.933440454304218e-05, + "rewards/rejected": 0.00015437835827469826, + "step": 87 + }, + { + "epoch": 0.05747407951669524, + "grad_norm": 9.540765629514766, + "learning_rate": 8.57142857142857e-08, + "logits/chosen": -1.6761497259140015, + "logits/rejected": -1.6493499279022217, + "logps/chosen": -565.0869140625, + "logps/rejected": -575.0073852539062, + "loss": 0.6916, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.001043493626639247, + "rewards/margins": 0.004106073174625635, + "rewards/rejected": -0.0051495665684342384, + "step": 88 + }, + { + "epoch": 0.058127194056657686, + "grad_norm": 10.17514292490149, + "learning_rate": 8.668831168831168e-08, + "logits/chosen": -1.8012816905975342, + "logits/rejected": -1.7821853160858154, + "logps/chosen": -503.46295166015625, + "logps/rejected": -492.86474609375, + "loss": 0.6923, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0033976598642766476, + "rewards/margins": -0.0017642759485170245, + "rewards/rejected": 0.005161936394870281, + "step": 89 + }, + { + "epoch": 0.05878030859662013, + "grad_norm": 9.087384783413304, + "learning_rate": 8.766233766233765e-08, + "logits/chosen": -1.6532089710235596, + "logits/rejected": -1.657583475112915, + "logps/chosen": -515.2310180664062, + "logps/rejected": -509.0292053222656, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0025243638083338737, + "rewards/margins": 0.0019023300847038627, + "rewards/rejected": 0.0006220341892912984, + "step": 90 + }, + { + "epoch": 0.05943342313658258, + "grad_norm": 11.600432401053613, + "learning_rate": 8.863636363636363e-08, + "logits/chosen": -1.7180678844451904, + "logits/rejected": -1.702775478363037, + "logps/chosen": -534.7321166992188, + "logps/rejected": -540.93994140625, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004910244606435299, + "rewards/margins": 0.003610987449064851, + "rewards/rejected": 0.0012992569245398045, + "step": 91 + }, + { + "epoch": 0.060086537676545024, + "grad_norm": 19.03876328807656, + "learning_rate": 8.96103896103896e-08, + "logits/chosen": -1.7077628374099731, + "logits/rejected": -1.7351174354553223, + "logps/chosen": -529.73681640625, + "logps/rejected": -508.55828857421875, + "loss": 0.6917, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.003127522300928831, + "rewards/margins": 0.00047443623770959675, + "rewards/rejected": -0.0036019585095345974, + "step": 92 + }, + { + "epoch": 0.06073965221650747, + "grad_norm": 16.589555191285804, + "learning_rate": 9.058441558441558e-08, + "logits/chosen": -1.6833391189575195, + "logits/rejected": -1.674033284187317, + "logps/chosen": -477.49798583984375, + "logps/rejected": -464.76336669921875, + "loss": 0.6915, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.001920571201480925, + "rewards/margins": 0.0017885612323880196, + "rewards/rejected": 0.00013200979446992278, + "step": 93 + }, + { + "epoch": 0.061392766756469916, + "grad_norm": 21.483285457055274, + "learning_rate": 9.155844155844155e-08, + "logits/chosen": -1.6899855136871338, + "logits/rejected": -1.7081849575042725, + "logps/chosen": -472.30670166015625, + "logps/rejected": -473.02532958984375, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0001794814015738666, + "rewards/margins": 0.0037227103020995855, + "rewards/rejected": -0.0039021919947117567, + "step": 94 + }, + { + "epoch": 0.06204588129643236, + "grad_norm": 8.584522992242459, + "learning_rate": 9.253246753246754e-08, + "logits/chosen": -1.6826878786087036, + "logits/rejected": -1.6474887132644653, + "logps/chosen": -469.18511962890625, + "logps/rejected": -475.0853576660156, + "loss": 0.6914, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0038090678863227367, + "rewards/margins": 0.005096444860100746, + "rewards/rejected": -0.00890551321208477, + "step": 95 + }, + { + "epoch": 0.06269899583639481, + "grad_norm": 10.611257258997737, + "learning_rate": 9.35064935064935e-08, + "logits/chosen": -1.7167426347732544, + "logits/rejected": -1.745957374572754, + "logps/chosen": -415.5352478027344, + "logps/rejected": -406.41021728515625, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -8.110757335089147e-05, + "rewards/margins": 0.005273091606795788, + "rewards/rejected": -0.005354199092835188, + "step": 96 + }, + { + "epoch": 0.06335211037635725, + "grad_norm": 17.637332082165788, + "learning_rate": 9.448051948051949e-08, + "logits/chosen": -1.7745977640151978, + "logits/rejected": -1.7798304557800293, + "logps/chosen": -471.1458435058594, + "logps/rejected": -464.43621826171875, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0014066218864172697, + "rewards/margins": 0.002926683286204934, + "rewards/rejected": -0.004333305172622204, + "step": 97 + }, + { + "epoch": 0.0640052249163197, + "grad_norm": 15.475362634607388, + "learning_rate": 9.545454545454546e-08, + "logits/chosen": -1.757310390472412, + "logits/rejected": -1.7441529035568237, + "logps/chosen": -543.7720336914062, + "logps/rejected": -521.256591796875, + "loss": 0.6905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0013191296020522714, + "rewards/margins": 0.006374385207891464, + "rewards/rejected": -0.005055255722254515, + "step": 98 + }, + { + "epoch": 0.06465833945628215, + "grad_norm": 8.90308809277182, + "learning_rate": 9.642857142857144e-08, + "logits/chosen": -1.7599557638168335, + "logits/rejected": -1.7538748979568481, + "logps/chosen": -488.3480224609375, + "logps/rejected": -493.222900390625, + "loss": 0.6907, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.005196585785597563, + "rewards/margins": 0.00629068398848176, + "rewards/rejected": -0.0010940982028841972, + "step": 99 + }, + { + "epoch": 0.06531145399624459, + "grad_norm": 8.599502814641067, + "learning_rate": 9.74025974025974e-08, + "logits/chosen": -1.6944297552108765, + "logits/rejected": -1.6814221143722534, + "logps/chosen": -491.616943359375, + "logps/rejected": -464.1621398925781, + "loss": 0.6911, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0056236740201711655, + "rewards/margins": 0.0013095784233883023, + "rewards/rejected": -0.006933252792805433, + "step": 100 + }, + { + "epoch": 0.06531145399624459, + "eval_logits/chosen": -1.7781137228012085, + "eval_logits/rejected": -1.783394694328308, + "eval_logps/chosen": -510.6041564941406, + "eval_logps/rejected": -502.9036865234375, + "eval_loss": 0.6912173628807068, + "eval_rewards/accuracies": 0.5640000104904175, + "eval_rewards/chosen": -0.0025881431065499783, + "eval_rewards/margins": 0.004054033197462559, + "eval_rewards/rejected": -0.006642176769673824, + "eval_runtime": 309.9462, + "eval_samples_per_second": 12.905, + "eval_steps_per_second": 0.807, + "step": 100 + }, + { + "epoch": 0.06596456853620704, + "grad_norm": 11.975392598780218, + "learning_rate": 9.837662337662338e-08, + "logits/chosen": -1.8105459213256836, + "logits/rejected": -1.7418413162231445, + "logps/chosen": -495.683837890625, + "logps/rejected": -558.681884765625, + "loss": 0.6916, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.013931737281382084, + "rewards/margins": 0.0046969749964773655, + "rewards/rejected": -0.018628710880875587, + "step": 101 + }, + { + "epoch": 0.06661768307616948, + "grad_norm": 19.08100215691415, + "learning_rate": 9.935064935064935e-08, + "logits/chosen": -1.7734951972961426, + "logits/rejected": -1.785585880279541, + "logps/chosen": -521.8323974609375, + "logps/rejected": -493.7707824707031, + "loss": 0.69, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0010571195743978024, + "rewards/margins": 0.004120311699807644, + "rewards/rejected": -0.0051774317398667336, + "step": 102 + }, + { + "epoch": 0.06727079761613193, + "grad_norm": 16.407227405047756, + "learning_rate": 1.0032467532467532e-07, + "logits/chosen": -1.813089370727539, + "logits/rejected": -1.8418606519699097, + "logps/chosen": -535.99658203125, + "logps/rejected": -547.38134765625, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0009367464226670563, + "rewards/margins": 0.004543146584182978, + "rewards/rejected": -0.005479893181473017, + "step": 103 + }, + { + "epoch": 0.06792391215609438, + "grad_norm": 10.784387451558391, + "learning_rate": 1.012987012987013e-07, + "logits/chosen": -1.6748157739639282, + "logits/rejected": -1.6749082803726196, + "logps/chosen": -544.4282836914062, + "logps/rejected": -502.47491455078125, + "loss": 0.6909, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.005649018567055464, + "rewards/margins": 0.002750987885519862, + "rewards/rejected": -0.008400006219744682, + "step": 104 + }, + { + "epoch": 0.06857702669605682, + "grad_norm": 7.366213696928153, + "learning_rate": 1.0227272727272727e-07, + "logits/chosen": -1.8434463739395142, + "logits/rejected": -1.8662176132202148, + "logps/chosen": -499.53240966796875, + "logps/rejected": -481.79345703125, + "loss": 0.6913, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008984744548797607, + "rewards/margins": 0.001937179360538721, + "rewards/rejected": -0.010921923443675041, + "step": 105 + }, + { + "epoch": 0.06923014123601927, + "grad_norm": 8.12342504564889, + "learning_rate": 1.0324675324675325e-07, + "logits/chosen": -1.7362070083618164, + "logits/rejected": -1.7785553932189941, + "logps/chosen": -524.1248779296875, + "logps/rejected": -506.4564514160156, + "loss": 0.6903, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.001606869394890964, + "rewards/margins": 0.002641630358994007, + "rewards/rejected": -0.0010347607312723994, + "step": 106 + }, + { + "epoch": 0.06988325577598171, + "grad_norm": 6.5649883506893385, + "learning_rate": 1.0422077922077921e-07, + "logits/chosen": -1.695350170135498, + "logits/rejected": -1.6904462575912476, + "logps/chosen": -556.2769775390625, + "logps/rejected": -519.2373046875, + "loss": 0.6906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.012179381214082241, + "rewards/margins": 0.008277284912765026, + "rewards/rejected": -0.020456667989492416, + "step": 107 + }, + { + "epoch": 0.07053637031594416, + "grad_norm": 13.47303429680314, + "learning_rate": 1.051948051948052e-07, + "logits/chosen": -1.6657894849777222, + "logits/rejected": -1.6693402528762817, + "logps/chosen": -535.307373046875, + "logps/rejected": -526.8478393554688, + "loss": 0.6906, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.006545607931911945, + "rewards/margins": 0.008519239723682404, + "rewards/rejected": -0.015064846724271774, + "step": 108 + }, + { + "epoch": 0.0711894848559066, + "grad_norm": 16.04827613099803, + "learning_rate": 1.0616883116883116e-07, + "logits/chosen": -1.6586048603057861, + "logits/rejected": -1.6781889200210571, + "logps/chosen": -530.2203979492188, + "logps/rejected": -543.2777709960938, + "loss": 0.6887, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0075052459724247456, + "rewards/margins": 0.00435210857540369, + "rewards/rejected": -0.011857354082167149, + "step": 109 + }, + { + "epoch": 0.07184259939586905, + "grad_norm": 6.9887079248570245, + "learning_rate": 1.0714285714285714e-07, + "logits/chosen": -1.719894289970398, + "logits/rejected": -1.73370361328125, + "logps/chosen": -461.96893310546875, + "logps/rejected": -460.13861083984375, + "loss": 0.6903, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.003742053173482418, + "rewards/margins": 0.008006452582776546, + "rewards/rejected": -0.011748505756258965, + "step": 110 + }, + { + "epoch": 0.0724957139358315, + "grad_norm": 17.35796671683516, + "learning_rate": 1.0811688311688311e-07, + "logits/chosen": -1.7226269245147705, + "logits/rejected": -1.70708429813385, + "logps/chosen": -531.7066650390625, + "logps/rejected": -529.4414672851562, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013616499491035938, + "rewards/margins": 0.0025032046250998974, + "rewards/rejected": -0.01611970365047455, + "step": 111 + }, + { + "epoch": 0.07314882847579394, + "grad_norm": 8.237208524897573, + "learning_rate": 1.0909090909090909e-07, + "logits/chosen": -1.6860663890838623, + "logits/rejected": -1.6681212186813354, + "logps/chosen": -412.7210693359375, + "logps/rejected": -408.09210205078125, + "loss": 0.6898, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.010377769358456135, + "rewards/margins": 0.0005328749539330602, + "rewards/rejected": -0.010910644195973873, + "step": 112 + }, + { + "epoch": 0.07380194301575639, + "grad_norm": 14.11440807458263, + "learning_rate": 1.1006493506493506e-07, + "logits/chosen": -1.7322160005569458, + "logits/rejected": -1.7938529253005981, + "logps/chosen": -524.374755859375, + "logps/rejected": -477.0540771484375, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010047688148915768, + "rewards/margins": 0.0021818396635353565, + "rewards/rejected": -0.012229528278112411, + "step": 113 + }, + { + "epoch": 0.07445505755571884, + "grad_norm": 19.41135088594217, + "learning_rate": 1.1103896103896104e-07, + "logits/chosen": -1.7305679321289062, + "logits/rejected": -1.687849521636963, + "logps/chosen": -497.699462890625, + "logps/rejected": -470.3168640136719, + "loss": 0.69, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.013295488432049751, + "rewards/margins": 0.0023743584752082825, + "rewards/rejected": -0.015669845044612885, + "step": 114 + }, + { + "epoch": 0.07510817209568128, + "grad_norm": 16.441138226102247, + "learning_rate": 1.12012987012987e-07, + "logits/chosen": -1.7420533895492554, + "logits/rejected": -1.7339985370635986, + "logps/chosen": -581.6981201171875, + "logps/rejected": -568.88623046875, + "loss": 0.6899, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.014405852183699608, + "rewards/margins": 0.004987373016774654, + "rewards/rejected": -0.019393224269151688, + "step": 115 + }, + { + "epoch": 0.07576128663564373, + "grad_norm": 8.586529812749172, + "learning_rate": 1.1298701298701299e-07, + "logits/chosen": -1.7932806015014648, + "logits/rejected": -1.7588953971862793, + "logps/chosen": -532.9600830078125, + "logps/rejected": -497.33770751953125, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.44545429572463e-05, + "rewards/margins": 0.00019355083350092173, + "rewards/rejected": -0.0002780060167424381, + "step": 116 + }, + { + "epoch": 0.07641440117560618, + "grad_norm": 11.94419407737389, + "learning_rate": 1.1396103896103895e-07, + "logits/chosen": -1.7573413848876953, + "logits/rejected": -1.7641370296478271, + "logps/chosen": -529.7691650390625, + "logps/rejected": -494.5425109863281, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009526774287223816, + "rewards/margins": 0.004622914828360081, + "rewards/rejected": -0.014149690046906471, + "step": 117 + }, + { + "epoch": 0.07706751571556862, + "grad_norm": 8.288500408438793, + "learning_rate": 1.1493506493506493e-07, + "logits/chosen": -1.6377328634262085, + "logits/rejected": -1.7309472560882568, + "logps/chosen": -479.968505859375, + "logps/rejected": -474.0822448730469, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0036013172939419746, + "rewards/margins": 0.0017371103167533875, + "rewards/rejected": -0.005338427610695362, + "step": 118 + }, + { + "epoch": 0.07772063025553107, + "grad_norm": 7.65494131386099, + "learning_rate": 1.159090909090909e-07, + "logits/chosen": -1.720550298690796, + "logits/rejected": -1.7268420457839966, + "logps/chosen": -511.4947814941406, + "logps/rejected": -518.540283203125, + "loss": 0.6886, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.002116803778335452, + "rewards/margins": 0.014177341014146805, + "rewards/rejected": -0.016294144093990326, + "step": 119 + }, + { + "epoch": 0.07837374479549351, + "grad_norm": 7.87167890173762, + "learning_rate": 1.1688311688311688e-07, + "logits/chosen": -1.6774556636810303, + "logits/rejected": -1.663172960281372, + "logps/chosen": -486.4458923339844, + "logps/rejected": -449.217529296875, + "loss": 0.6893, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.014286703430116177, + "rewards/margins": 0.00825961772352457, + "rewards/rejected": -0.022546321153640747, + "step": 120 + }, + { + "epoch": 0.07902685933545596, + "grad_norm": 6.91850162308767, + "learning_rate": 1.1785714285714285e-07, + "logits/chosen": -1.694111943244934, + "logits/rejected": -1.7094212770462036, + "logps/chosen": -547.680419921875, + "logps/rejected": -498.9229736328125, + "loss": 0.6889, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.003534555435180664, + "rewards/margins": 0.00987851619720459, + "rewards/rejected": -0.013413071632385254, + "step": 121 + }, + { + "epoch": 0.0796799738754184, + "grad_norm": 9.143990922203699, + "learning_rate": 1.1883116883116883e-07, + "logits/chosen": -1.6494725942611694, + "logits/rejected": -1.6601428985595703, + "logps/chosen": -484.3493347167969, + "logps/rejected": -487.05059814453125, + "loss": 0.6897, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.014209108427166939, + "rewards/margins": 0.0037215554621070623, + "rewards/rejected": -0.017930667847394943, + "step": 122 + }, + { + "epoch": 0.08033308841538085, + "grad_norm": 16.41656810337495, + "learning_rate": 1.198051948051948e-07, + "logits/chosen": -1.7612290382385254, + "logits/rejected": -1.7043516635894775, + "logps/chosen": -504.9853210449219, + "logps/rejected": -622.8348999023438, + "loss": 0.6853, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.017526667565107346, + "rewards/margins": 0.025080684572458267, + "rewards/rejected": -0.04260735213756561, + "step": 123 + }, + { + "epoch": 0.0809862029553433, + "grad_norm": 25.637817852859747, + "learning_rate": 1.207792207792208e-07, + "logits/chosen": -1.6937708854675293, + "logits/rejected": -1.6883544921875, + "logps/chosen": -384.4920959472656, + "logps/rejected": -507.6640930175781, + "loss": 0.6885, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.024008184671401978, + "rewards/margins": 0.0192131157964468, + "rewards/rejected": -0.04322130233049393, + "step": 124 + }, + { + "epoch": 0.08163931749530574, + "grad_norm": 22.15806914900568, + "learning_rate": 1.2175324675324674e-07, + "logits/chosen": -1.5668435096740723, + "logits/rejected": -1.5370216369628906, + "logps/chosen": -491.66278076171875, + "logps/rejected": -550.0761108398438, + "loss": 0.6886, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.030099207535386086, + "rewards/margins": 0.012964273802936077, + "rewards/rejected": -0.04306348040699959, + "step": 125 + }, + { + "epoch": 0.08229243203526819, + "grad_norm": 21.70535951365313, + "learning_rate": 1.2272727272727272e-07, + "logits/chosen": -1.718205451965332, + "logits/rejected": -1.7234244346618652, + "logps/chosen": -532.9760131835938, + "logps/rejected": -553.8611450195312, + "loss": 0.6876, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.028685756027698517, + "rewards/margins": 0.005702130496501923, + "rewards/rejected": -0.03438788652420044, + "step": 126 + }, + { + "epoch": 0.08294554657523064, + "grad_norm": 16.27604402538048, + "learning_rate": 1.237012987012987e-07, + "logits/chosen": -1.7105181217193604, + "logits/rejected": -1.7274971008300781, + "logps/chosen": -452.66326904296875, + "logps/rejected": -424.0040588378906, + "loss": 0.6882, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.023135703057050705, + "rewards/margins": 0.00477581936866045, + "rewards/rejected": -0.02791152149438858, + "step": 127 + }, + { + "epoch": 0.08359866111519308, + "grad_norm": 8.360473540141022, + "learning_rate": 1.2467532467532469e-07, + "logits/chosen": -1.7243715524673462, + "logits/rejected": -1.719879388809204, + "logps/chosen": -577.6858520507812, + "logps/rejected": -574.778564453125, + "loss": 0.6875, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.01625608652830124, + "rewards/margins": 0.01795053295791149, + "rewards/rejected": -0.03420662134885788, + "step": 128 + }, + { + "epoch": 0.08425177565515553, + "grad_norm": 7.426631483908922, + "learning_rate": 1.2564935064935064e-07, + "logits/chosen": -1.6296292543411255, + "logits/rejected": -1.645986557006836, + "logps/chosen": -514.0096435546875, + "logps/rejected": -507.37237548828125, + "loss": 0.6876, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00854131206870079, + "rewards/margins": 0.007741453126072884, + "rewards/rejected": -0.016282765194773674, + "step": 129 + }, + { + "epoch": 0.08490489019511797, + "grad_norm": 12.61939534454806, + "learning_rate": 1.2662337662337662e-07, + "logits/chosen": -1.7628737688064575, + "logits/rejected": -1.784444808959961, + "logps/chosen": -515.55419921875, + "logps/rejected": -445.7431640625, + "loss": 0.6873, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.026726404204964638, + "rewards/margins": 0.007864664308726788, + "rewards/rejected": -0.03459106758236885, + "step": 130 + }, + { + "epoch": 0.08555800473508042, + "grad_norm": 6.940242974613805, + "learning_rate": 1.275974025974026e-07, + "logits/chosen": -1.723580002784729, + "logits/rejected": -1.7397780418395996, + "logps/chosen": -490.61956787109375, + "logps/rejected": -519.4144287109375, + "loss": 0.686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02750040590763092, + "rewards/margins": 0.028302742168307304, + "rewards/rejected": -0.055803146213293076, + "step": 131 + }, + { + "epoch": 0.08621111927504287, + "grad_norm": 9.176637689468706, + "learning_rate": 1.2857142857142855e-07, + "logits/chosen": -1.7846028804779053, + "logits/rejected": -1.7753349542617798, + "logps/chosen": -537.8035278320312, + "logps/rejected": -657.7811889648438, + "loss": 0.6844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.039286255836486816, + "rewards/margins": 0.02002215012907982, + "rewards/rejected": -0.059308405965566635, + "step": 132 + }, + { + "epoch": 0.08686423381500531, + "grad_norm": 8.420310076033147, + "learning_rate": 1.2954545454545453e-07, + "logits/chosen": -1.7075575590133667, + "logits/rejected": -1.7227400541305542, + "logps/chosen": -505.66326904296875, + "logps/rejected": -475.65240478515625, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03044586256146431, + "rewards/margins": 0.011737257242202759, + "rewards/rejected": -0.04218312352895737, + "step": 133 + }, + { + "epoch": 0.08751734835496776, + "grad_norm": 17.816465764608324, + "learning_rate": 1.3051948051948052e-07, + "logits/chosen": -1.6720120906829834, + "logits/rejected": -1.6279743909835815, + "logps/chosen": -497.46038818359375, + "logps/rejected": -475.72442626953125, + "loss": 0.6897, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02933063544332981, + "rewards/margins": 0.009470460936427116, + "rewards/rejected": -0.03880109637975693, + "step": 134 + }, + { + "epoch": 0.0881704628949302, + "grad_norm": 8.48602261690419, + "learning_rate": 1.314935064935065e-07, + "logits/chosen": -1.6978920698165894, + "logits/rejected": -1.7289825677871704, + "logps/chosen": -530.339111328125, + "logps/rejected": -493.29913330078125, + "loss": 0.6867, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.02855011634528637, + "rewards/margins": 0.0154922716319561, + "rewards/rejected": -0.04404238611459732, + "step": 135 + }, + { + "epoch": 0.08882357743489265, + "grad_norm": 9.366599761861622, + "learning_rate": 1.3246753246753245e-07, + "logits/chosen": -1.7024781703948975, + "logits/rejected": -1.6904100179672241, + "logps/chosen": -410.39215087890625, + "logps/rejected": -436.4376525878906, + "loss": 0.6859, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.012004725635051727, + "rewards/margins": 0.023064523935317993, + "rewards/rejected": -0.03506924957036972, + "step": 136 + }, + { + "epoch": 0.0894766919748551, + "grad_norm": 17.561997807750693, + "learning_rate": 1.3344155844155843e-07, + "logits/chosen": -1.6883275508880615, + "logits/rejected": -1.6822022199630737, + "logps/chosen": -470.7867736816406, + "logps/rejected": -476.1307067871094, + "loss": 0.6862, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04406014457345009, + "rewards/margins": 0.020365647971630096, + "rewards/rejected": -0.06442578881978989, + "step": 137 + }, + { + "epoch": 0.09012980651481754, + "grad_norm": 6.311518850983097, + "learning_rate": 1.344155844155844e-07, + "logits/chosen": -1.6198405027389526, + "logits/rejected": -1.6445224285125732, + "logps/chosen": -577.4093627929688, + "logps/rejected": -524.4210815429688, + "loss": 0.6889, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.06365535408258438, + "rewards/margins": -0.0076623717322945595, + "rewards/rejected": -0.0559929758310318, + "step": 138 + }, + { + "epoch": 0.09078292105477999, + "grad_norm": 7.902620448969978, + "learning_rate": 1.353896103896104e-07, + "logits/chosen": -1.626230001449585, + "logits/rejected": -1.6832174062728882, + "logps/chosen": -452.2579650878906, + "logps/rejected": -457.4361267089844, + "loss": 0.6848, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.046750400215387344, + "rewards/margins": 0.018618889153003693, + "rewards/rejected": -0.06536928564310074, + "step": 139 + }, + { + "epoch": 0.09143603559474243, + "grad_norm": 9.061462627285719, + "learning_rate": 1.3636363636363635e-07, + "logits/chosen": -1.786929965019226, + "logits/rejected": -1.748306393623352, + "logps/chosen": -477.7335510253906, + "logps/rejected": -470.07623291015625, + "loss": 0.6823, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.05674988403916359, + "rewards/margins": 0.019451485946774483, + "rewards/rejected": -0.07620137184858322, + "step": 140 + }, + { + "epoch": 0.09208915013470488, + "grad_norm": 8.27691403538538, + "learning_rate": 1.3733766233766233e-07, + "logits/chosen": -1.7523396015167236, + "logits/rejected": -1.750685691833496, + "logps/chosen": -544.35888671875, + "logps/rejected": -523.8983154296875, + "loss": 0.6861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04826396703720093, + "rewards/margins": 0.019393280148506165, + "rewards/rejected": -0.06765724718570709, + "step": 141 + }, + { + "epoch": 0.09274226467466731, + "grad_norm": 40.62514167248459, + "learning_rate": 1.383116883116883e-07, + "logits/chosen": -1.6654901504516602, + "logits/rejected": -1.6239811182022095, + "logps/chosen": -479.25079345703125, + "logps/rejected": -476.67572021484375, + "loss": 0.6897, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.045496560633182526, + "rewards/margins": 0.007325804326683283, + "rewards/rejected": -0.05282236635684967, + "step": 142 + }, + { + "epoch": 0.09339537921462976, + "grad_norm": 28.906389537639807, + "learning_rate": 1.392857142857143e-07, + "logits/chosen": -1.805418848991394, + "logits/rejected": -1.7966678142547607, + "logps/chosen": -521.4232788085938, + "logps/rejected": -504.65008544921875, + "loss": 0.6839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.043921858072280884, + "rewards/margins": 0.02651369199156761, + "rewards/rejected": -0.0704355537891388, + "step": 143 + }, + { + "epoch": 0.0940484937545922, + "grad_norm": 7.759027403093762, + "learning_rate": 1.4025974025974024e-07, + "logits/chosen": -1.808532953262329, + "logits/rejected": -1.7720710039138794, + "logps/chosen": -530.4207763671875, + "logps/rejected": -543.4842529296875, + "loss": 0.6823, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.058577775955200195, + "rewards/margins": 0.01986054703593254, + "rewards/rejected": -0.07843831926584244, + "step": 144 + }, + { + "epoch": 0.09470160829455465, + "grad_norm": 19.65781688946105, + "learning_rate": 1.4123376623376622e-07, + "logits/chosen": -1.7204667329788208, + "logits/rejected": -1.6994764804840088, + "logps/chosen": -600.3792724609375, + "logps/rejected": -596.1019897460938, + "loss": 0.6858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07191239297389984, + "rewards/margins": 0.02992035634815693, + "rewards/rejected": -0.10183274745941162, + "step": 145 + }, + { + "epoch": 0.0953547228345171, + "grad_norm": 27.083668342567048, + "learning_rate": 1.422077922077922e-07, + "logits/chosen": -1.6221792697906494, + "logits/rejected": -1.6598668098449707, + "logps/chosen": -526.0675048828125, + "logps/rejected": -521.8135986328125, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.042592283338308334, + "rewards/margins": 0.012567641213536263, + "rewards/rejected": -0.055159930139780045, + "step": 146 + }, + { + "epoch": 0.09600783737447954, + "grad_norm": 8.808375433626706, + "learning_rate": 1.4318181818181818e-07, + "logits/chosen": -1.8164383172988892, + "logits/rejected": -1.8282561302185059, + "logps/chosen": -614.2815551757812, + "logps/rejected": -643.0169067382812, + "loss": 0.6865, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07687513530254364, + "rewards/margins": 0.012751526199281216, + "rewards/rejected": -0.08962665498256683, + "step": 147 + }, + { + "epoch": 0.09666095191444199, + "grad_norm": 7.09253863840217, + "learning_rate": 1.4415584415584414e-07, + "logits/chosen": -1.6832506656646729, + "logits/rejected": -1.7021249532699585, + "logps/chosen": -498.71942138671875, + "logps/rejected": -501.34869384765625, + "loss": 0.6838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.054647862911224365, + "rewards/margins": 0.026497341692447662, + "rewards/rejected": -0.08114521205425262, + "step": 148 + }, + { + "epoch": 0.09731406645440444, + "grad_norm": 16.781059240998136, + "learning_rate": 1.4512987012987012e-07, + "logits/chosen": -1.715110182762146, + "logits/rejected": -1.7118535041809082, + "logps/chosen": -474.4345703125, + "logps/rejected": -454.8081970214844, + "loss": 0.6855, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.062017131596803665, + "rewards/margins": 0.004093126859515905, + "rewards/rejected": -0.06611025333404541, + "step": 149 + }, + { + "epoch": 0.09796718099436688, + "grad_norm": 20.190214518687835, + "learning_rate": 1.461038961038961e-07, + "logits/chosen": -1.6403260231018066, + "logits/rejected": -1.601685881614685, + "logps/chosen": -516.1248168945312, + "logps/rejected": -536.0068969726562, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05097360908985138, + "rewards/margins": 0.021962041035294533, + "rewards/rejected": -0.07293565571308136, + "step": 150 + }, + { + "epoch": 0.09862029553432933, + "grad_norm": 12.205029148852612, + "learning_rate": 1.4707792207792208e-07, + "logits/chosen": -1.6476508378982544, + "logits/rejected": -1.7027806043624878, + "logps/chosen": -488.97259521484375, + "logps/rejected": -461.6432800292969, + "loss": 0.681, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03291236236691475, + "rewards/margins": 0.01708255708217621, + "rewards/rejected": -0.04999491944909096, + "step": 151 + }, + { + "epoch": 0.09927341007429177, + "grad_norm": 22.040695719974778, + "learning_rate": 1.4805194805194803e-07, + "logits/chosen": -1.7001389265060425, + "logits/rejected": -1.7121036052703857, + "logps/chosen": -473.0397033691406, + "logps/rejected": -513.0086059570312, + "loss": 0.6773, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.05476941913366318, + "rewards/margins": 0.05117000639438629, + "rewards/rejected": -0.10593942552804947, + "step": 152 + }, + { + "epoch": 0.09992652461425422, + "grad_norm": 17.736405376818148, + "learning_rate": 1.4902597402597404e-07, + "logits/chosen": -1.7151176929473877, + "logits/rejected": -1.6587436199188232, + "logps/chosen": -528.3651733398438, + "logps/rejected": -519.2221069335938, + "loss": 0.6872, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.050955355167388916, + "rewards/margins": 0.02759256586432457, + "rewards/rejected": -0.07854791730642319, + "step": 153 + }, + { + "epoch": 0.10057963915421667, + "grad_norm": 20.0575565981101, + "learning_rate": 1.5e-07, + "logits/chosen": -1.7852433919906616, + "logits/rejected": -1.7886905670166016, + "logps/chosen": -482.718017578125, + "logps/rejected": -482.28759765625, + "loss": 0.6837, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07595521211624146, + "rewards/margins": 0.016735542565584183, + "rewards/rejected": -0.09269075095653534, + "step": 154 + }, + { + "epoch": 0.10123275369417911, + "grad_norm": 6.032126948811811, + "learning_rate": 1.499998048075819e-07, + "logits/chosen": -1.649161696434021, + "logits/rejected": -1.6406943798065186, + "logps/chosen": -509.6943359375, + "logps/rejected": -504.8258361816406, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07708139717578888, + "rewards/margins": 0.013607650063931942, + "rewards/rejected": -0.0906890481710434, + "step": 155 + }, + { + "epoch": 0.10188586823414156, + "grad_norm": 8.923286598087802, + "learning_rate": 1.4999921923134367e-07, + "logits/chosen": -1.7759987115859985, + "logits/rejected": -1.731602668762207, + "logps/chosen": -508.0286865234375, + "logps/rejected": -549.663330078125, + "loss": 0.6864, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04344085976481438, + "rewards/margins": 0.024824578315019608, + "rewards/rejected": -0.06826544553041458, + "step": 156 + }, + { + "epoch": 0.102538982774104, + "grad_norm": 18.440082543584566, + "learning_rate": 1.499982432743333e-07, + "logits/chosen": -1.7433996200561523, + "logits/rejected": -1.7404325008392334, + "logps/chosen": -483.1097717285156, + "logps/rejected": -518.4947509765625, + "loss": 0.682, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.07219300419092178, + "rewards/margins": 0.0072326865047216415, + "rewards/rejected": -0.07942568510770798, + "step": 157 + }, + { + "epoch": 0.10319209731406645, + "grad_norm": 26.950148340749994, + "learning_rate": 1.4999687694163071e-07, + "logits/chosen": -1.6927975416183472, + "logits/rejected": -1.7593554258346558, + "logps/chosen": -587.9935302734375, + "logps/rejected": -547.7039184570312, + "loss": 0.6831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05381292477250099, + "rewards/margins": 0.02101278305053711, + "rewards/rejected": -0.0748257115483284, + "step": 158 + }, + { + "epoch": 0.1038452118540289, + "grad_norm": 7.542663745310761, + "learning_rate": 1.499951202403479e-07, + "logits/chosen": -1.7588621377944946, + "logits/rejected": -1.703225016593933, + "logps/chosen": -505.0411376953125, + "logps/rejected": -599.8470458984375, + "loss": 0.6763, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05922378972172737, + "rewards/margins": 0.053594160825014114, + "rewards/rejected": -0.11281795799732208, + "step": 159 + }, + { + "epoch": 0.10449832639399134, + "grad_norm": 17.13212774750112, + "learning_rate": 1.4999297317962876e-07, + "logits/chosen": -1.7104802131652832, + "logits/rejected": -1.7329843044281006, + "logps/chosen": -514.36279296875, + "logps/rejected": -533.5743408203125, + "loss": 0.6821, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07126364856958389, + "rewards/margins": 0.031774699687957764, + "rewards/rejected": -0.10303835570812225, + "step": 160 + }, + { + "epoch": 0.10515144093395379, + "grad_norm": 10.645728177870987, + "learning_rate": 1.4999043577064894e-07, + "logits/chosen": -1.826465129852295, + "logits/rejected": -1.8719959259033203, + "logps/chosen": -616.4337768554688, + "logps/rejected": -609.4326171875, + "loss": 0.6779, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07613222301006317, + "rewards/margins": 0.02693980187177658, + "rewards/rejected": -0.10307201743125916, + "step": 161 + }, + { + "epoch": 0.10580455547391623, + "grad_norm": 31.5882679445136, + "learning_rate": 1.4998750802661605e-07, + "logits/chosen": -1.727925419807434, + "logits/rejected": -1.724989414215088, + "logps/chosen": -599.0576171875, + "logps/rejected": -563.6280517578125, + "loss": 0.683, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08880521357059479, + "rewards/margins": 0.025629587471485138, + "rewards/rejected": -0.11443479359149933, + "step": 162 + }, + { + "epoch": 0.10645767001387868, + "grad_norm": 12.592298356038635, + "learning_rate": 1.4998418996276933e-07, + "logits/chosen": -1.738313913345337, + "logits/rejected": -1.753507375717163, + "logps/chosen": -579.961669921875, + "logps/rejected": -594.0586547851562, + "loss": 0.6819, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05988356098532677, + "rewards/margins": 0.03945886343717575, + "rewards/rejected": -0.09934242069721222, + "step": 163 + }, + { + "epoch": 0.10711078455384113, + "grad_norm": 19.46009999388665, + "learning_rate": 1.499804815963798e-07, + "logits/chosen": -1.7987678050994873, + "logits/rejected": -1.807891845703125, + "logps/chosen": -550.672119140625, + "logps/rejected": -561.8424072265625, + "loss": 0.677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.040339332073926926, + "rewards/margins": 0.04034976288676262, + "rewards/rejected": -0.08068908751010895, + "step": 164 + }, + { + "epoch": 0.10776389909380357, + "grad_norm": 27.510358333266502, + "learning_rate": 1.4997638294674996e-07, + "logits/chosen": -1.7597366571426392, + "logits/rejected": -1.7664936780929565, + "logps/chosen": -517.5831909179688, + "logps/rejected": -524.0874633789062, + "loss": 0.6807, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10509812831878662, + "rewards/margins": 0.02205245941877365, + "rewards/rejected": -0.12715059518814087, + "step": 165 + }, + { + "epoch": 0.10841701363376602, + "grad_norm": 26.359728514103736, + "learning_rate": 1.499718940352138e-07, + "logits/chosen": -1.660881519317627, + "logits/rejected": -1.6824941635131836, + "logps/chosen": -526.3524169921875, + "logps/rejected": -505.15814208984375, + "loss": 0.6795, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09450910985469818, + "rewards/margins": 0.04056902229785919, + "rewards/rejected": -0.13507813215255737, + "step": 166 + }, + { + "epoch": 0.10907012817372846, + "grad_norm": 13.379229704663794, + "learning_rate": 1.499670148851367e-07, + "logits/chosen": -1.6919353008270264, + "logits/rejected": -1.7081630229949951, + "logps/chosen": -462.4615783691406, + "logps/rejected": -441.3831787109375, + "loss": 0.6793, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.047392651438713074, + "rewards/margins": 0.02708030305802822, + "rewards/rejected": -0.07447294890880585, + "step": 167 + }, + { + "epoch": 0.10972324271369091, + "grad_norm": 23.765383175487347, + "learning_rate": 1.4996174552191534e-07, + "logits/chosen": -1.6622581481933594, + "logits/rejected": -1.679819941520691, + "logps/chosen": -464.3515930175781, + "logps/rejected": -449.2407531738281, + "loss": 0.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.059018295258283615, + "rewards/margins": 0.03328193724155426, + "rewards/rejected": -0.09230024367570877, + "step": 168 + }, + { + "epoch": 0.11037635725365336, + "grad_norm": 7.18145864960063, + "learning_rate": 1.4995608597297736e-07, + "logits/chosen": -1.7335408926010132, + "logits/rejected": -1.7119437456130981, + "logps/chosen": -517.2203979492188, + "logps/rejected": -603.5662841796875, + "loss": 0.6748, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.06651943922042847, + "rewards/margins": 0.05973542109131813, + "rewards/rejected": -0.1262548714876175, + "step": 169 + }, + { + "epoch": 0.1110294717936158, + "grad_norm": 33.768142398887186, + "learning_rate": 1.4995003626778149e-07, + "logits/chosen": -1.7551608085632324, + "logits/rejected": -1.718156099319458, + "logps/chosen": -503.027099609375, + "logps/rejected": -557.4558715820312, + "loss": 0.685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06958004832267761, + "rewards/margins": 0.021374018862843513, + "rewards/rejected": -0.09095406532287598, + "step": 170 + }, + { + "epoch": 0.11168258633357825, + "grad_norm": 13.332558234776263, + "learning_rate": 1.4994359643781725e-07, + "logits/chosen": -1.6308660507202148, + "logits/rejected": -1.6401119232177734, + "logps/chosen": -491.20263671875, + "logps/rejected": -492.4757995605469, + "loss": 0.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08171894401311874, + "rewards/margins": 0.03456515446305275, + "rewards/rejected": -0.1162840873003006, + "step": 171 + }, + { + "epoch": 0.1123357008735407, + "grad_norm": 11.374539796821198, + "learning_rate": 1.4993676651660479e-07, + "logits/chosen": -1.7294381856918335, + "logits/rejected": -1.7581063508987427, + "logps/chosen": -510.65618896484375, + "logps/rejected": -494.7225341796875, + "loss": 0.6791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07346140593290329, + "rewards/margins": 0.016454335302114487, + "rewards/rejected": -0.08991573750972748, + "step": 172 + }, + { + "epoch": 0.11298881541350314, + "grad_norm": 10.782302320724707, + "learning_rate": 1.4992954653969473e-07, + "logits/chosen": -1.7590150833129883, + "logits/rejected": -1.7065110206604004, + "logps/chosen": -579.6332397460938, + "logps/rejected": -642.480224609375, + "loss": 0.6756, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16745489835739136, + "rewards/margins": 0.04030359163880348, + "rewards/rejected": -0.20775848627090454, + "step": 173 + }, + { + "epoch": 0.11364192995346559, + "grad_norm": 30.030650307666996, + "learning_rate": 1.4992193654466804e-07, + "logits/chosen": -1.7184665203094482, + "logits/rejected": -1.751639485359192, + "logps/chosen": -505.05078125, + "logps/rejected": -476.3111572265625, + "loss": 0.6774, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07360526919364929, + "rewards/margins": 0.02522566169500351, + "rewards/rejected": -0.0988309308886528, + "step": 174 + }, + { + "epoch": 0.11429504449342803, + "grad_norm": 6.801707032164358, + "learning_rate": 1.4991393657113566e-07, + "logits/chosen": -1.7240628004074097, + "logits/rejected": -1.693426489830017, + "logps/chosen": -513.940673828125, + "logps/rejected": -593.656494140625, + "loss": 0.6822, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.09224756807088852, + "rewards/margins": 0.05458322912454605, + "rewards/rejected": -0.14683078229427338, + "step": 175 + }, + { + "epoch": 0.11494815903339048, + "grad_norm": 6.667305623368847, + "learning_rate": 1.499055466607386e-07, + "logits/chosen": -1.6916465759277344, + "logits/rejected": -1.7375673055648804, + "logps/chosen": -533.5892944335938, + "logps/rejected": -507.4823913574219, + "loss": 0.675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.080963134765625, + "rewards/margins": 0.053236886858940125, + "rewards/rejected": -0.13420002162456512, + "step": 176 + }, + { + "epoch": 0.11560127357335293, + "grad_norm": 6.771588986223236, + "learning_rate": 1.498967668571474e-07, + "logits/chosen": -1.735141396522522, + "logits/rejected": -1.7742522954940796, + "logps/chosen": -510.868408203125, + "logps/rejected": -452.6504211425781, + "loss": 0.6749, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10077373683452606, + "rewards/margins": 0.003773924894630909, + "rewards/rejected": -0.1045476570725441, + "step": 177 + }, + { + "epoch": 0.11625438811331537, + "grad_norm": 9.953482315244722, + "learning_rate": 1.4988759720606207e-07, + "logits/chosen": -1.7570782899856567, + "logits/rejected": -1.8162400722503662, + "logps/chosen": -554.0537719726562, + "logps/rejected": -541.5122680664062, + "loss": 0.6801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10425091534852982, + "rewards/margins": 0.060477737337350845, + "rewards/rejected": -0.16472867131233215, + "step": 178 + }, + { + "epoch": 0.11690750265327782, + "grad_norm": 5.973916986801873, + "learning_rate": 1.4987803775521184e-07, + "logits/chosen": -1.690962791442871, + "logits/rejected": -1.689450979232788, + "logps/chosen": -643.9844360351562, + "logps/rejected": -589.926025390625, + "loss": 0.6829, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.12561464309692383, + "rewards/margins": -0.008268720470368862, + "rewards/rejected": -0.11734593659639359, + "step": 179 + }, + { + "epoch": 0.11756061719324026, + "grad_norm": 14.891949152028998, + "learning_rate": 1.4986808855435498e-07, + "logits/chosen": -1.6953084468841553, + "logits/rejected": -1.6965618133544922, + "logps/chosen": -465.82086181640625, + "logps/rejected": -438.37799072265625, + "loss": 0.6767, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08036669343709946, + "rewards/margins": 0.017360081896185875, + "rewards/rejected": -0.09772677719593048, + "step": 180 + }, + { + "epoch": 0.11821373173320271, + "grad_norm": 16.11153067050427, + "learning_rate": 1.498577496552783e-07, + "logits/chosen": -1.7526249885559082, + "logits/rejected": -1.7759355306625366, + "logps/chosen": -482.6202697753906, + "logps/rejected": -492.8947448730469, + "loss": 0.6659, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09437385201454163, + "rewards/margins": 0.037018630653619766, + "rewards/rejected": -0.1313924789428711, + "step": 181 + }, + { + "epoch": 0.11886684627316516, + "grad_norm": 22.729237496040554, + "learning_rate": 1.4984702111179715e-07, + "logits/chosen": -1.6132816076278687, + "logits/rejected": -1.655545949935913, + "logps/chosen": -527.8369750976562, + "logps/rejected": -527.8175048828125, + "loss": 0.6727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0954374372959137, + "rewards/margins": 0.05533631891012192, + "rewards/rejected": -0.1507737785577774, + "step": 182 + }, + { + "epoch": 0.1195199608131276, + "grad_norm": 26.231353893622398, + "learning_rate": 1.4983590297975505e-07, + "logits/chosen": -1.6354296207427979, + "logits/rejected": -1.6777453422546387, + "logps/chosen": -504.5555419921875, + "logps/rejected": -538.9359741210938, + "loss": 0.6843, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12271232157945633, + "rewards/margins": 0.04234550893306732, + "rewards/rejected": -0.16505783796310425, + "step": 183 + }, + { + "epoch": 0.12017307535309005, + "grad_norm": 7.114753478180882, + "learning_rate": 1.498243953170233e-07, + "logits/chosen": -1.69046950340271, + "logits/rejected": -1.7043442726135254, + "logps/chosen": -530.9853515625, + "logps/rejected": -529.0479125976562, + "loss": 0.677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06343412399291992, + "rewards/margins": 0.035453617572784424, + "rewards/rejected": -0.09888774156570435, + "step": 184 + }, + { + "epoch": 0.1208261898930525, + "grad_norm": 21.787798884018414, + "learning_rate": 1.498124981835008e-07, + "logits/chosen": -1.7766292095184326, + "logits/rejected": -1.74836266040802, + "logps/chosen": -475.3532409667969, + "logps/rejected": -489.02923583984375, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07969984412193298, + "rewards/margins": 0.041919078677892685, + "rewards/rejected": -0.12161892652511597, + "step": 185 + }, + { + "epoch": 0.12147930443301494, + "grad_norm": 23.709784617273577, + "learning_rate": 1.4980021164111366e-07, + "logits/chosen": -1.6516242027282715, + "logits/rejected": -1.673815369606018, + "logps/chosen": -455.3662109375, + "logps/rejected": -492.9649963378906, + "loss": 0.677, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.14689013361930847, + "rewards/margins": 0.02547987923026085, + "rewards/rejected": -0.1723700314760208, + "step": 186 + }, + { + "epoch": 0.12213241897297739, + "grad_norm": 12.348486008384063, + "learning_rate": 1.4978753575381498e-07, + "logits/chosen": -1.7523754835128784, + "logits/rejected": -1.745179533958435, + "logps/chosen": -503.9389343261719, + "logps/rejected": -510.8143615722656, + "loss": 0.6702, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.11973577737808228, + "rewards/margins": 0.05588344484567642, + "rewards/rejected": -0.1756192147731781, + "step": 187 + }, + { + "epoch": 0.12278553351293983, + "grad_norm": 13.48324499414794, + "learning_rate": 1.4977447058758439e-07, + "logits/chosen": -1.7630393505096436, + "logits/rejected": -1.7760398387908936, + "logps/chosen": -571.8206787109375, + "logps/rejected": -541.6646118164062, + "loss": 0.6801, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11507945507764816, + "rewards/margins": 0.029867036268115044, + "rewards/rejected": -0.14494650065898895, + "step": 188 + }, + { + "epoch": 0.12343864805290228, + "grad_norm": 10.415485511799975, + "learning_rate": 1.4976101621042783e-07, + "logits/chosen": -1.684414267539978, + "logits/rejected": -1.6862590312957764, + "logps/chosen": -535.666015625, + "logps/rejected": -548.4473876953125, + "loss": 0.6756, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1297360062599182, + "rewards/margins": 0.05931150168180466, + "rewards/rejected": -0.18904751539230347, + "step": 189 + }, + { + "epoch": 0.12409176259286472, + "grad_norm": 6.350273320282756, + "learning_rate": 1.4974717269237708e-07, + "logits/chosen": -1.6381902694702148, + "logits/rejected": -1.6570154428482056, + "logps/chosen": -497.8411560058594, + "logps/rejected": -489.9413757324219, + "loss": 0.6632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11118555068969727, + "rewards/margins": 0.06526104360818863, + "rewards/rejected": -0.17644661664962769, + "step": 190 + }, + { + "epoch": 0.12474487713282717, + "grad_norm": 6.038127824462166, + "learning_rate": 1.4973294010548946e-07, + "logits/chosen": -1.6311917304992676, + "logits/rejected": -1.6577972173690796, + "logps/chosen": -518.0223388671875, + "logps/rejected": -520.6466674804688, + "loss": 0.677, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15784503519535065, + "rewards/margins": 0.01725524663925171, + "rewards/rejected": -0.17510026693344116, + "step": 191 + }, + { + "epoch": 0.12539799167278962, + "grad_norm": 26.218558847948582, + "learning_rate": 1.4971831852384745e-07, + "logits/chosen": -1.6489263772964478, + "logits/rejected": -1.6216522455215454, + "logps/chosen": -570.0574951171875, + "logps/rejected": -592.376708984375, + "loss": 0.6842, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.17664462327957153, + "rewards/margins": 0.014460130594670773, + "rewards/rejected": -0.1911047399044037, + "step": 192 + }, + { + "epoch": 0.12605110621275206, + "grad_norm": 14.358619994205673, + "learning_rate": 1.497033080235583e-07, + "logits/chosen": -1.754329800605774, + "logits/rejected": -1.7733534574508667, + "logps/chosen": -515.3424682617188, + "logps/rejected": -505.7689208984375, + "loss": 0.6687, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12092112749814987, + "rewards/margins": 0.024929339066147804, + "rewards/rejected": -0.14585046470165253, + "step": 193 + }, + { + "epoch": 0.1267042207527145, + "grad_norm": 17.358492156799695, + "learning_rate": 1.4968790868275365e-07, + "logits/chosen": -1.666093349456787, + "logits/rejected": -1.6654276847839355, + "logps/chosen": -510.59320068359375, + "logps/rejected": -497.73431396484375, + "loss": 0.6717, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13687899708747864, + "rewards/margins": 0.009451786056160927, + "rewards/rejected": -0.1463308036327362, + "step": 194 + }, + { + "epoch": 0.12735733529267695, + "grad_norm": 33.350981012916925, + "learning_rate": 1.4967212058158908e-07, + "logits/chosen": -1.7516664266586304, + "logits/rejected": -1.7618868350982666, + "logps/chosen": -574.4312744140625, + "logps/rejected": -562.3271484375, + "loss": 0.687, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1302211880683899, + "rewards/margins": 0.03780483454465866, + "rewards/rejected": -0.16802600026130676, + "step": 195 + }, + { + "epoch": 0.1280104498326394, + "grad_norm": 12.460824021943772, + "learning_rate": 1.4965594380224373e-07, + "logits/chosen": -1.6539489030838013, + "logits/rejected": -1.6902060508728027, + "logps/chosen": -540.94873046875, + "logps/rejected": -570.3787231445312, + "loss": 0.6737, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16215386986732483, + "rewards/margins": 0.05639547482132912, + "rewards/rejected": -0.21854937076568604, + "step": 196 + }, + { + "epoch": 0.12866356437260185, + "grad_norm": 7.277120819141452, + "learning_rate": 1.4963937842891983e-07, + "logits/chosen": -1.7623552083969116, + "logits/rejected": -1.777740240097046, + "logps/chosen": -562.0401611328125, + "logps/rejected": -549.0805053710938, + "loss": 0.6625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15102756023406982, + "rewards/margins": 0.02262779325246811, + "rewards/rejected": -0.17365534603595734, + "step": 197 + }, + { + "epoch": 0.1293166789125643, + "grad_norm": 6.4071063075287045, + "learning_rate": 1.4962242454784235e-07, + "logits/chosen": -1.7132699489593506, + "logits/rejected": -1.6834791898727417, + "logps/chosen": -473.0800476074219, + "logps/rejected": -574.1148681640625, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13342300057411194, + "rewards/margins": 0.09166872501373291, + "rewards/rejected": -0.22509171068668365, + "step": 198 + }, + { + "epoch": 0.12996979345252674, + "grad_norm": 16.16942335499691, + "learning_rate": 1.4960508224725845e-07, + "logits/chosen": -1.697445273399353, + "logits/rejected": -1.737441897392273, + "logps/chosen": -491.8898010253906, + "logps/rejected": -465.43133544921875, + "loss": 0.6753, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12092840671539307, + "rewards/margins": 0.01965322345495224, + "rewards/rejected": -0.1405816376209259, + "step": 199 + }, + { + "epoch": 0.13062290799248918, + "grad_norm": 9.236224313907128, + "learning_rate": 1.495873516174371e-07, + "logits/chosen": -1.818574070930481, + "logits/rejected": -1.7804887294769287, + "logps/chosen": -583.37548828125, + "logps/rejected": -581.4929809570312, + "loss": 0.6703, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20678864419460297, + "rewards/margins": 0.013942277058959007, + "rewards/rejected": -0.22073093056678772, + "step": 200 + }, + { + "epoch": 0.13062290799248918, + "eval_logits/chosen": -1.7592895030975342, + "eval_logits/rejected": -1.7686011791229248, + "eval_logps/chosen": -524.639404296875, + "eval_logps/rejected": -522.0521240234375, + "eval_loss": 0.6713127493858337, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": -0.1429399996995926, + "eval_rewards/margins": 0.05518652871251106, + "eval_rewards/rejected": -0.19812652468681335, + "eval_runtime": 300.5974, + "eval_samples_per_second": 13.307, + "eval_steps_per_second": 0.832, + "step": 200 + }, + { + "epoch": 0.13127602253245163, + "grad_norm": 8.660547603879296, + "learning_rate": 1.4956923275066855e-07, + "logits/chosen": -1.7169371843338013, + "logits/rejected": -1.7529581785202026, + "logps/chosen": -491.5006408691406, + "logps/rejected": -438.540283203125, + "loss": 0.6728, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12306279689073563, + "rewards/margins": 0.035101909190416336, + "rewards/rejected": -0.15816472470760345, + "step": 201 + }, + { + "epoch": 0.13192913707241408, + "grad_norm": 9.941469801759618, + "learning_rate": 1.4955072574126383e-07, + "logits/chosen": -1.6325197219848633, + "logits/rejected": -1.6774578094482422, + "logps/chosen": -501.1746826171875, + "logps/rejected": -474.1672668457031, + "loss": 0.6778, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1591545194387436, + "rewards/margins": -0.0037236525677144527, + "rewards/rejected": -0.155430868268013, + "step": 202 + }, + { + "epoch": 0.13258225161237652, + "grad_norm": 6.629751952322596, + "learning_rate": 1.4953183068555444e-07, + "logits/chosen": -1.6159188747406006, + "logits/rejected": -1.655371069908142, + "logps/chosen": -511.7205810546875, + "logps/rejected": -497.550537109375, + "loss": 0.6755, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.17229245603084564, + "rewards/margins": 0.036124035716056824, + "rewards/rejected": -0.20841647684574127, + "step": 203 + }, + { + "epoch": 0.13323536615233897, + "grad_norm": 14.86037901564549, + "learning_rate": 1.4951254768189153e-07, + "logits/chosen": -1.7175514698028564, + "logits/rejected": -1.7305848598480225, + "logps/chosen": -521.8203735351562, + "logps/rejected": -473.41363525390625, + "loss": 0.6698, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1745811253786087, + "rewards/margins": 0.05103900283575058, + "rewards/rejected": -0.22562013566493988, + "step": 204 + }, + { + "epoch": 0.13388848069230141, + "grad_norm": 9.81928586229961, + "learning_rate": 1.4949287683064572e-07, + "logits/chosen": -1.7592101097106934, + "logits/rejected": -1.7103134393692017, + "logps/chosen": -506.8877258300781, + "logps/rejected": -512.57568359375, + "loss": 0.6752, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12894892692565918, + "rewards/margins": 0.058204442262649536, + "rewards/rejected": -0.1871533840894699, + "step": 205 + }, + { + "epoch": 0.13454159523226386, + "grad_norm": 12.70868918654191, + "learning_rate": 1.4947281823420636e-07, + "logits/chosen": -1.74403715133667, + "logits/rejected": -1.7773162126541138, + "logps/chosen": -549.9198608398438, + "logps/rejected": -537.7161865234375, + "loss": 0.6726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15765821933746338, + "rewards/margins": 0.024023467674851418, + "rewards/rejected": -0.18168169260025024, + "step": 206 + }, + { + "epoch": 0.1351947097722263, + "grad_norm": 9.464198406615232, + "learning_rate": 1.4945237199698105e-07, + "logits/chosen": -1.7218544483184814, + "logits/rejected": -1.7245031595230103, + "logps/chosen": -575.4227294921875, + "logps/rejected": -613.4913330078125, + "loss": 0.6628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2187643200159073, + "rewards/margins": 0.05670395866036415, + "rewards/rejected": -0.27546826004981995, + "step": 207 + }, + { + "epoch": 0.13584782431218875, + "grad_norm": 40.46016052410099, + "learning_rate": 1.4943153822539518e-07, + "logits/chosen": -1.7167556285858154, + "logits/rejected": -1.7805320024490356, + "logps/chosen": -536.693603515625, + "logps/rejected": -531.85986328125, + "loss": 0.6732, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18354004621505737, + "rewards/margins": 0.048775896430015564, + "rewards/rejected": -0.23231592774391174, + "step": 208 + }, + { + "epoch": 0.1365009388521512, + "grad_norm": 22.288492214742288, + "learning_rate": 1.4941031702789123e-07, + "logits/chosen": -1.6213892698287964, + "logits/rejected": -1.63565993309021, + "logps/chosen": -565.4679565429688, + "logps/rejected": -557.27685546875, + "loss": 0.6666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13010086119174957, + "rewards/margins": 0.07792194187641144, + "rewards/rejected": -0.20802278816699982, + "step": 209 + }, + { + "epoch": 0.13715405339211365, + "grad_norm": 15.197653036128015, + "learning_rate": 1.4938870851492834e-07, + "logits/chosen": -1.6974363327026367, + "logits/rejected": -1.7302565574645996, + "logps/chosen": -496.8112487792969, + "logps/rejected": -477.2409362792969, + "loss": 0.6659, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15533235669136047, + "rewards/margins": 0.022654909640550613, + "rewards/rejected": -0.1779872477054596, + "step": 210 + }, + { + "epoch": 0.1378071679320761, + "grad_norm": 6.3314279991311055, + "learning_rate": 1.4936671279898162e-07, + "logits/chosen": -1.7678776979446411, + "logits/rejected": -1.8228353261947632, + "logps/chosen": -473.0333557128906, + "logps/rejected": -480.84259033203125, + "loss": 0.6704, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2336454689502716, + "rewards/margins": 0.01017429493367672, + "rewards/rejected": -0.24381977319717407, + "step": 211 + }, + { + "epoch": 0.13846028247203854, + "grad_norm": 33.62952471238388, + "learning_rate": 1.493443299945417e-07, + "logits/chosen": -1.674032211303711, + "logits/rejected": -1.6666409969329834, + "logps/chosen": -487.1645202636719, + "logps/rejected": -511.31976318359375, + "loss": 0.6672, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17207323014736176, + "rewards/margins": 0.0362030491232872, + "rewards/rejected": -0.20827627182006836, + "step": 212 + }, + { + "epoch": 0.13911339701200098, + "grad_norm": 17.559552038629228, + "learning_rate": 1.4932156021811393e-07, + "logits/chosen": -1.6960467100143433, + "logits/rejected": -1.6814875602722168, + "logps/chosen": -448.228515625, + "logps/rejected": -523.8681640625, + "loss": 0.6702, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.19846728444099426, + "rewards/margins": 0.07104050368070602, + "rewards/rejected": -0.2695077657699585, + "step": 213 + }, + { + "epoch": 0.13976651155196343, + "grad_norm": 21.300381341216802, + "learning_rate": 1.492984035882181e-07, + "logits/chosen": -1.7540203332901, + "logits/rejected": -1.6938629150390625, + "logps/chosen": -510.3170166015625, + "logps/rejected": -555.1364135742188, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20609605312347412, + "rewards/margins": 0.04784277081489563, + "rewards/rejected": -0.25393882393836975, + "step": 214 + }, + { + "epoch": 0.14041962609192588, + "grad_norm": 18.999047527818195, + "learning_rate": 1.4927486022538743e-07, + "logits/chosen": -1.6708929538726807, + "logits/rejected": -1.712494969367981, + "logps/chosen": -605.5905151367188, + "logps/rejected": -571.502685546875, + "loss": 0.6654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21136337518692017, + "rewards/margins": 0.03826094791293144, + "rewards/rejected": -0.24962429702281952, + "step": 215 + }, + { + "epoch": 0.14107274063188832, + "grad_norm": 21.481084161977915, + "learning_rate": 1.4925093025216822e-07, + "logits/chosen": -1.6800730228424072, + "logits/rejected": -1.6440744400024414, + "logps/chosen": -561.3792724609375, + "logps/rejected": -543.5944213867188, + "loss": 0.6691, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.18766431510448456, + "rewards/margins": 0.08474057167768478, + "rewards/rejected": -0.27240487933158875, + "step": 216 + }, + { + "epoch": 0.14172585517185077, + "grad_norm": 36.19470917799039, + "learning_rate": 1.4922661379311916e-07, + "logits/chosen": -1.7101688385009766, + "logits/rejected": -1.7070766687393188, + "logps/chosen": -513.0838012695312, + "logps/rejected": -580.56982421875, + "loss": 0.6719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19527553021907806, + "rewards/margins": 0.07064682245254517, + "rewards/rejected": -0.26592230796813965, + "step": 217 + }, + { + "epoch": 0.1423789697118132, + "grad_norm": 19.07151886221254, + "learning_rate": 1.4920191097481055e-07, + "logits/chosen": -1.6426142454147339, + "logits/rejected": -1.6799873113632202, + "logps/chosen": -544.11083984375, + "logps/rejected": -474.7537536621094, + "loss": 0.6781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24504226446151733, + "rewards/margins": -0.005615321919322014, + "rewards/rejected": -0.23942697048187256, + "step": 218 + }, + { + "epoch": 0.14303208425177566, + "grad_norm": 16.177094066689747, + "learning_rate": 1.4917682192582382e-07, + "logits/chosen": -1.6813910007476807, + "logits/rejected": -1.7110719680786133, + "logps/chosen": -583.3339233398438, + "logps/rejected": -526.6600341796875, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2589135766029358, + "rewards/margins": -0.009151825681328773, + "rewards/rejected": -0.24976176023483276, + "step": 219 + }, + { + "epoch": 0.1436851987917381, + "grad_norm": 7.819380233896718, + "learning_rate": 1.4915134677675075e-07, + "logits/chosen": -1.6574629545211792, + "logits/rejected": -1.6859042644500732, + "logps/chosen": -482.84991455078125, + "logps/rejected": -488.5449523925781, + "loss": 0.6614, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.13272731006145477, + "rewards/margins": 0.06106055900454521, + "rewards/rejected": -0.1937878578901291, + "step": 220 + }, + { + "epoch": 0.14433831333170055, + "grad_norm": 24.184875753596092, + "learning_rate": 1.4912548566019288e-07, + "logits/chosen": -1.730996012687683, + "logits/rejected": -1.7523002624511719, + "logps/chosen": -550.131103515625, + "logps/rejected": -511.6734313964844, + "loss": 0.6859, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.16892360150814056, + "rewards/margins": 0.009203894063830376, + "rewards/rejected": -0.1781274825334549, + "step": 221 + }, + { + "epoch": 0.144991427871663, + "grad_norm": 40.392554453725786, + "learning_rate": 1.4909923871076067e-07, + "logits/chosen": -1.7876818180084229, + "logits/rejected": -1.8064486980438232, + "logps/chosen": -545.5269775390625, + "logps/rejected": -553.6494140625, + "loss": 0.663, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.21288356184959412, + "rewards/margins": 0.10687703639268875, + "rewards/rejected": -0.31976059079170227, + "step": 222 + }, + { + "epoch": 0.14564454241162544, + "grad_norm": 26.592793442117337, + "learning_rate": 1.4907260606507294e-07, + "logits/chosen": -1.7836247682571411, + "logits/rejected": -1.7736763954162598, + "logps/chosen": -491.8907470703125, + "logps/rejected": -510.7823181152344, + "loss": 0.6533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17148932814598083, + "rewards/margins": 0.10181709378957748, + "rewards/rejected": -0.27330639958381653, + "step": 223 + }, + { + "epoch": 0.1462976569515879, + "grad_norm": 7.602991933565122, + "learning_rate": 1.490455878617561e-07, + "logits/chosen": -1.7201324701309204, + "logits/rejected": -1.6427278518676758, + "logps/chosen": -493.15155029296875, + "logps/rejected": -565.255615234375, + "loss": 0.6665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16769693791866302, + "rewards/margins": 0.1270834058523178, + "rewards/rejected": -0.29478034377098083, + "step": 224 + }, + { + "epoch": 0.14695077149155034, + "grad_norm": 19.894237277380455, + "learning_rate": 1.4901818424144348e-07, + "logits/chosen": -1.7538783550262451, + "logits/rejected": -1.7509037256240845, + "logps/chosen": -613.3535766601562, + "logps/rejected": -623.0008544921875, + "loss": 0.6655, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.22273880243301392, + "rewards/margins": 0.08194239437580109, + "rewards/rejected": -0.3046812117099762, + "step": 225 + }, + { + "epoch": 0.14760388603151278, + "grad_norm": 27.20495808215994, + "learning_rate": 1.4899039534677446e-07, + "logits/chosen": -1.678489327430725, + "logits/rejected": -1.598201870918274, + "logps/chosen": -493.9471435546875, + "logps/rejected": -593.92333984375, + "loss": 0.6653, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23504018783569336, + "rewards/margins": 0.11126020550727844, + "rewards/rejected": -0.3463003933429718, + "step": 226 + }, + { + "epoch": 0.14825700057147523, + "grad_norm": 30.195854234112794, + "learning_rate": 1.489622213223939e-07, + "logits/chosen": -1.7436487674713135, + "logits/rejected": -1.712781310081482, + "logps/chosen": -514.0474243164062, + "logps/rejected": -565.7003784179688, + "loss": 0.6667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1994318962097168, + "rewards/margins": 0.1081019788980484, + "rewards/rejected": -0.307533860206604, + "step": 227 + }, + { + "epoch": 0.14891011511143767, + "grad_norm": 6.64214876236842, + "learning_rate": 1.4893366231495133e-07, + "logits/chosen": -1.6693284511566162, + "logits/rejected": -1.6889541149139404, + "logps/chosen": -538.619873046875, + "logps/rejected": -530.3651123046875, + "loss": 0.6658, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18542669713497162, + "rewards/margins": 0.06551727652549744, + "rewards/rejected": -0.25094395875930786, + "step": 228 + }, + { + "epoch": 0.14956322965140012, + "grad_norm": 19.211074462031995, + "learning_rate": 1.489047184731001e-07, + "logits/chosen": -1.784908413887024, + "logits/rejected": -1.801963210105896, + "logps/chosen": -545.5239868164062, + "logps/rejected": -563.0906372070312, + "loss": 0.6648, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20398400723934174, + "rewards/margins": 0.05913097783923149, + "rewards/rejected": -0.26311495900154114, + "step": 229 + }, + { + "epoch": 0.15021634419136257, + "grad_norm": 18.555655165809004, + "learning_rate": 1.488753899474967e-07, + "logits/chosen": -1.7692968845367432, + "logits/rejected": -1.7724251747131348, + "logps/chosen": -505.1445617675781, + "logps/rejected": -516.6723022460938, + "loss": 0.6724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21376638114452362, + "rewards/margins": 0.06437156349420547, + "rewards/rejected": -0.2781379520893097, + "step": 230 + }, + { + "epoch": 0.150869458731325, + "grad_norm": 12.537841664973817, + "learning_rate": 1.4884567689079993e-07, + "logits/chosen": -1.7249011993408203, + "logits/rejected": -1.7343446016311646, + "logps/chosen": -559.3654174804688, + "logps/rejected": -483.27349853515625, + "loss": 0.6834, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.23856452107429504, + "rewards/margins": -0.02549094706773758, + "rewards/rejected": -0.21307358145713806, + "step": 231 + }, + { + "epoch": 0.15152257327128746, + "grad_norm": 7.974565094982971, + "learning_rate": 1.4881557945767017e-07, + "logits/chosen": -1.6963640451431274, + "logits/rejected": -1.6877349615097046, + "logps/chosen": -534.5089111328125, + "logps/rejected": -530.7501220703125, + "loss": 0.6613, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2620963156223297, + "rewards/margins": 0.11971013993024826, + "rewards/rejected": -0.38180649280548096, + "step": 232 + }, + { + "epoch": 0.1521756878112499, + "grad_norm": 18.70437584528313, + "learning_rate": 1.4878509780476852e-07, + "logits/chosen": -1.6992989778518677, + "logits/rejected": -1.7136512994766235, + "logps/chosen": -494.5487060546875, + "logps/rejected": -466.22174072265625, + "loss": 0.6548, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1514841765165329, + "rewards/margins": 0.03264841437339783, + "rewards/rejected": -0.18413259088993073, + "step": 233 + }, + { + "epoch": 0.15282880235121235, + "grad_norm": 6.4709344020100925, + "learning_rate": 1.4875423209075598e-07, + "logits/chosen": -1.7042124271392822, + "logits/rejected": -1.6889500617980957, + "logps/chosen": -625.1463623046875, + "logps/rejected": -610.1585083007812, + "loss": 0.6638, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2791493237018585, + "rewards/margins": 0.07801420241594315, + "rewards/rejected": -0.35716351866722107, + "step": 234 + }, + { + "epoch": 0.1534819168911748, + "grad_norm": 6.93021377573884, + "learning_rate": 1.4872298247629262e-07, + "logits/chosen": -1.7167174816131592, + "logits/rejected": -1.7292131185531616, + "logps/chosen": -540.9368286132812, + "logps/rejected": -528.7276611328125, + "loss": 0.6536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21813733875751495, + "rewards/margins": 0.07377970963716507, + "rewards/rejected": -0.29191702604293823, + "step": 235 + }, + { + "epoch": 0.15413503143113724, + "grad_norm": 17.987164953454663, + "learning_rate": 1.486913491240368e-07, + "logits/chosen": -1.7713466882705688, + "logits/rejected": -1.7571359872817993, + "logps/chosen": -569.6904907226562, + "logps/rejected": -590.6726684570312, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27502891421318054, + "rewards/margins": 0.07631243020296097, + "rewards/rejected": -0.3513413667678833, + "step": 236 + }, + { + "epoch": 0.1547881459710997, + "grad_norm": 13.606732916452634, + "learning_rate": 1.4865933219864426e-07, + "logits/chosen": -1.687245488166809, + "logits/rejected": -1.6311473846435547, + "logps/chosen": -535.9907836914062, + "logps/rejected": -575.3447875976562, + "loss": 0.6569, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.24033436179161072, + "rewards/margins": 0.06762813031673431, + "rewards/rejected": -0.3079625368118286, + "step": 237 + }, + { + "epoch": 0.15544126051106213, + "grad_norm": 19.472827108008165, + "learning_rate": 1.4862693186676727e-07, + "logits/chosen": -1.6632615327835083, + "logits/rejected": -1.7188615798950195, + "logps/chosen": -570.304931640625, + "logps/rejected": -528.0740966796875, + "loss": 0.6658, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2309207022190094, + "rewards/margins": 0.0879175215959549, + "rewards/rejected": -0.3188382387161255, + "step": 238 + }, + { + "epoch": 0.15609437505102458, + "grad_norm": 22.88007198498636, + "learning_rate": 1.4859414829705384e-07, + "logits/chosen": -1.6684261560440063, + "logits/rejected": -1.7101047039031982, + "logps/chosen": -567.6971435546875, + "logps/rejected": -546.453369140625, + "loss": 0.6719, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19033075869083405, + "rewards/margins": 0.06966198980808258, + "rewards/rejected": -0.2599927484989166, + "step": 239 + }, + { + "epoch": 0.15674748959098703, + "grad_norm": 36.841939632438105, + "learning_rate": 1.4856098166014676e-07, + "logits/chosen": -1.7202922105789185, + "logits/rejected": -1.6927356719970703, + "logps/chosen": -574.1094970703125, + "logps/rejected": -533.2037963867188, + "loss": 0.6703, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.25585952401161194, + "rewards/margins": 0.0038572316989302635, + "rewards/rejected": -0.2597167491912842, + "step": 240 + }, + { + "epoch": 0.15740060413094947, + "grad_norm": 21.236611473334218, + "learning_rate": 1.4852743212868267e-07, + "logits/chosen": -1.653743028640747, + "logits/rejected": -1.6406570672988892, + "logps/chosen": -491.4779968261719, + "logps/rejected": -504.90289306640625, + "loss": 0.6619, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.26939359307289124, + "rewards/margins": -0.0027835238724946976, + "rewards/rejected": -0.2666100263595581, + "step": 241 + }, + { + "epoch": 0.15805371867091192, + "grad_norm": 22.64341132140898, + "learning_rate": 1.4849349987729134e-07, + "logits/chosen": -1.6768407821655273, + "logits/rejected": -1.707101821899414, + "logps/chosen": -508.9664306640625, + "logps/rejected": -484.92047119140625, + "loss": 0.6723, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22562530636787415, + "rewards/margins": 0.022399617359042168, + "rewards/rejected": -0.24802491068840027, + "step": 242 + }, + { + "epoch": 0.15870683321087437, + "grad_norm": 38.25143501681014, + "learning_rate": 1.4845918508259456e-07, + "logits/chosen": -1.5299750566482544, + "logits/rejected": -1.5509228706359863, + "logps/chosen": -515.6123046875, + "logps/rejected": -503.2989196777344, + "loss": 0.6704, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2918733060359955, + "rewards/margins": 0.03851493448019028, + "rewards/rejected": -0.33038821816444397, + "step": 243 + }, + { + "epoch": 0.1593599477508368, + "grad_norm": 42.26475756226102, + "learning_rate": 1.4842448792320532e-07, + "logits/chosen": -1.6829333305358887, + "logits/rejected": -1.6792709827423096, + "logps/chosen": -551.5542602539062, + "logps/rejected": -623.4794921875, + "loss": 0.6633, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.2852548360824585, + "rewards/margins": 0.11484323441982269, + "rewards/rejected": -0.4000980854034424, + "step": 244 + }, + { + "epoch": 0.16001306229079926, + "grad_norm": 6.6637868943907135, + "learning_rate": 1.4838940857972694e-07, + "logits/chosen": -1.7213988304138184, + "logits/rejected": -1.7653172016143799, + "logps/chosen": -597.0615234375, + "logps/rejected": -580.1837158203125, + "loss": 0.6619, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2760111391544342, + "rewards/margins": 0.08438215404748917, + "rewards/rejected": -0.3603932559490204, + "step": 245 + }, + { + "epoch": 0.1606661768307617, + "grad_norm": 7.466033172797767, + "learning_rate": 1.4835394723475195e-07, + "logits/chosen": -1.7462300062179565, + "logits/rejected": -1.7599581480026245, + "logps/chosen": -624.055908203125, + "logps/rejected": -556.203369140625, + "loss": 0.6582, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.32625612616539, + "rewards/margins": 0.019012071192264557, + "rewards/rejected": -0.3452681601047516, + "step": 246 + }, + { + "epoch": 0.16131929137072415, + "grad_norm": 18.36532918357786, + "learning_rate": 1.4831810407286132e-07, + "logits/chosen": -1.7185373306274414, + "logits/rejected": -1.6984236240386963, + "logps/chosen": -497.5492858886719, + "logps/rejected": -549.6527709960938, + "loss": 0.657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21095693111419678, + "rewards/margins": 0.10894811898469925, + "rewards/rejected": -0.31990501284599304, + "step": 247 + }, + { + "epoch": 0.1619724059106866, + "grad_norm": 6.712430351467606, + "learning_rate": 1.4828187928062343e-07, + "logits/chosen": -1.677220344543457, + "logits/rejected": -1.6865521669387817, + "logps/chosen": -496.3159484863281, + "logps/rejected": -491.939208984375, + "loss": 0.6605, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2620304822921753, + "rewards/margins": 0.05196765065193176, + "rewards/rejected": -0.31399816274642944, + "step": 248 + }, + { + "epoch": 0.16262552045064904, + "grad_norm": 16.60390065867787, + "learning_rate": 1.4824527304659303e-07, + "logits/chosen": -1.8142789602279663, + "logits/rejected": -1.808518648147583, + "logps/chosen": -573.7063598632812, + "logps/rejected": -586.904541015625, + "loss": 0.656, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2939229905605316, + "rewards/margins": 0.20337671041488647, + "rewards/rejected": -0.4972996711730957, + "step": 249 + }, + { + "epoch": 0.1632786349906115, + "grad_norm": 8.526623591061165, + "learning_rate": 1.4820828556131042e-07, + "logits/chosen": -1.682909607887268, + "logits/rejected": -1.6931588649749756, + "logps/chosen": -553.196044921875, + "logps/rejected": -606.1085205078125, + "loss": 0.6407, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3215971887111664, + "rewards/margins": 0.13771557807922363, + "rewards/rejected": -0.4593127965927124, + "step": 250 + }, + { + "epoch": 0.16393174953057393, + "grad_norm": 31.575444561187965, + "learning_rate": 1.4817091701730025e-07, + "logits/chosen": -1.7506418228149414, + "logits/rejected": -1.7645210027694702, + "logps/chosen": -504.40216064453125, + "logps/rejected": -466.3576965332031, + "loss": 0.6607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2621554732322693, + "rewards/margins": 0.02839628979563713, + "rewards/rejected": -0.2905517518520355, + "step": 251 + }, + { + "epoch": 0.16458486407053638, + "grad_norm": 33.198936730829594, + "learning_rate": 1.4813316760907073e-07, + "logits/chosen": -1.7571806907653809, + "logits/rejected": -1.7107410430908203, + "logps/chosen": -535.119140625, + "logps/rejected": -570.49658203125, + "loss": 0.6653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2747470736503601, + "rewards/margins": 0.10117386281490326, + "rewards/rejected": -0.37592095136642456, + "step": 252 + }, + { + "epoch": 0.16523797861049883, + "grad_norm": 8.023556183248012, + "learning_rate": 1.480950375331125e-07, + "logits/chosen": -1.724591851234436, + "logits/rejected": -1.724915623664856, + "logps/chosen": -533.9208374023438, + "logps/rejected": -578.927734375, + "loss": 0.6578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2872743010520935, + "rewards/margins": 0.1008455902338028, + "rewards/rejected": -0.3881198763847351, + "step": 253 + }, + { + "epoch": 0.16589109315046127, + "grad_norm": 17.29562845944618, + "learning_rate": 1.4805652698789758e-07, + "logits/chosen": -1.6281641721725464, + "logits/rejected": -1.6535223722457886, + "logps/chosen": -501.57318115234375, + "logps/rejected": -623.32568359375, + "loss": 0.6373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29188841581344604, + "rewards/margins": 0.24822545051574707, + "rewards/rejected": -0.5401138663291931, + "step": 254 + }, + { + "epoch": 0.16654420769042372, + "grad_norm": 12.955908095771617, + "learning_rate": 1.480176361738784e-07, + "logits/chosen": -1.6974416971206665, + "logits/rejected": -1.7031437158584595, + "logps/chosen": -536.0557250976562, + "logps/rejected": -505.6365661621094, + "loss": 0.6586, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.32581400871276855, + "rewards/margins": 0.018291521817445755, + "rewards/rejected": -0.3441055715084076, + "step": 255 + }, + { + "epoch": 0.16719732223038616, + "grad_norm": 10.398012268043638, + "learning_rate": 1.4797836529348678e-07, + "logits/chosen": -1.6877224445343018, + "logits/rejected": -1.6973309516906738, + "logps/chosen": -539.1455688476562, + "logps/rejected": -539.2821044921875, + "loss": 0.6554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30231714248657227, + "rewards/margins": 0.07891248911619186, + "rewards/rejected": -0.3812296390533447, + "step": 256 + }, + { + "epoch": 0.1678504367703486, + "grad_norm": 13.095086309931633, + "learning_rate": 1.4793871455113277e-07, + "logits/chosen": -1.7343683242797852, + "logits/rejected": -1.7324395179748535, + "logps/chosen": -509.0965576171875, + "logps/rejected": -560.6080322265625, + "loss": 0.6479, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.30541422963142395, + "rewards/margins": 0.18525291979312897, + "rewards/rejected": -0.4906671643257141, + "step": 257 + }, + { + "epoch": 0.16850355131031106, + "grad_norm": 8.776278689782488, + "learning_rate": 1.478986841532037e-07, + "logits/chosen": -1.6799153089523315, + "logits/rejected": -1.6792922019958496, + "logps/chosen": -507.31787109375, + "logps/rejected": -538.5675659179688, + "loss": 0.6616, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2834991216659546, + "rewards/margins": 0.1000271737575531, + "rewards/rejected": -0.3835262954235077, + "step": 258 + }, + { + "epoch": 0.1691566658502735, + "grad_norm": 12.19544220249078, + "learning_rate": 1.4785827430806304e-07, + "logits/chosen": -1.6284384727478027, + "logits/rejected": -1.5995004177093506, + "logps/chosen": -496.8951416015625, + "logps/rejected": -489.3295593261719, + "loss": 0.6491, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.344193696975708, + "rewards/margins": 0.09732924401760101, + "rewards/rejected": -0.4415229558944702, + "step": 259 + }, + { + "epoch": 0.16980978039023595, + "grad_norm": 34.82104290571965, + "learning_rate": 1.4781748522604932e-07, + "logits/chosen": -1.7195165157318115, + "logits/rejected": -1.7009164094924927, + "logps/chosen": -552.0281982421875, + "logps/rejected": -630.2988891601562, + "loss": 0.6517, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3577840328216553, + "rewards/margins": 0.2054346352815628, + "rewards/rejected": -0.5632186532020569, + "step": 260 + }, + { + "epoch": 0.1704628949301984, + "grad_norm": 30.979244743569765, + "learning_rate": 1.4777631711947508e-07, + "logits/chosen": -1.6629853248596191, + "logits/rejected": -1.6167453527450562, + "logps/chosen": -505.7708740234375, + "logps/rejected": -532.100830078125, + "loss": 0.6485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3221421241760254, + "rewards/margins": 0.06837349385023117, + "rewards/rejected": -0.39051565527915955, + "step": 261 + }, + { + "epoch": 0.17111600947016084, + "grad_norm": 41.43490075038442, + "learning_rate": 1.4773477020262572e-07, + "logits/chosen": -1.728777527809143, + "logits/rejected": -1.726311445236206, + "logps/chosen": -620.5463256835938, + "logps/rejected": -633.969970703125, + "loss": 0.6527, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4750226140022278, + "rewards/margins": 0.15798604488372803, + "rewards/rejected": -0.6330086588859558, + "step": 262 + }, + { + "epoch": 0.1717691240101233, + "grad_norm": 23.108934696505543, + "learning_rate": 1.4769284469175835e-07, + "logits/chosen": -1.7010533809661865, + "logits/rejected": -1.700863242149353, + "logps/chosen": -496.9283752441406, + "logps/rejected": -496.18096923828125, + "loss": 0.6404, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2904779613018036, + "rewards/margins": 0.08895879238843918, + "rewards/rejected": -0.37943676114082336, + "step": 263 + }, + { + "epoch": 0.17242223855008573, + "grad_norm": 29.393486906363, + "learning_rate": 1.476505408051008e-07, + "logits/chosen": -1.7014069557189941, + "logits/rejected": -1.785143494606018, + "logps/chosen": -577.2012939453125, + "logps/rejected": -608.8991088867188, + "loss": 0.6534, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3605682849884033, + "rewards/margins": 0.11243538558483124, + "rewards/rejected": -0.47300365567207336, + "step": 264 + }, + { + "epoch": 0.17307535309004818, + "grad_norm": 6.776931632936762, + "learning_rate": 1.476078587628503e-07, + "logits/chosen": -1.7219668626785278, + "logits/rejected": -1.7316367626190186, + "logps/chosen": -613.4144287109375, + "logps/rejected": -600.9342041015625, + "loss": 0.6394, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4629693329334259, + "rewards/margins": 0.09507772326469421, + "rewards/rejected": -0.5580470561981201, + "step": 265 + }, + { + "epoch": 0.17372846763001062, + "grad_norm": 15.838565452388137, + "learning_rate": 1.4756479878717254e-07, + "logits/chosen": -1.6845935583114624, + "logits/rejected": -1.697797417640686, + "logps/chosen": -502.57818603515625, + "logps/rejected": -501.9476318359375, + "loss": 0.6348, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3347086012363434, + "rewards/margins": 0.14087814092636108, + "rewards/rejected": -0.4755867123603821, + "step": 266 + }, + { + "epoch": 0.17438158216997307, + "grad_norm": 22.80772385019741, + "learning_rate": 1.4752136110220027e-07, + "logits/chosen": -1.689018964767456, + "logits/rejected": -1.68304443359375, + "logps/chosen": -510.1197509765625, + "logps/rejected": -546.5357666015625, + "loss": 0.6512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30356815457344055, + "rewards/margins": 0.14922784268856049, + "rewards/rejected": -0.45279598236083984, + "step": 267 + }, + { + "epoch": 0.17503469670993552, + "grad_norm": 23.730722811526665, + "learning_rate": 1.4747754593403243e-07, + "logits/chosen": -1.6035892963409424, + "logits/rejected": -1.63158118724823, + "logps/chosen": -526.643798828125, + "logps/rejected": -575.2221069335938, + "loss": 0.6536, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3458973467350006, + "rewards/margins": 0.19965365529060364, + "rewards/rejected": -0.5455510020256042, + "step": 268 + }, + { + "epoch": 0.17568781124989796, + "grad_norm": 7.0753534582972, + "learning_rate": 1.4743335351073263e-07, + "logits/chosen": -1.7131729125976562, + "logits/rejected": -1.716842532157898, + "logps/chosen": -589.8107299804688, + "logps/rejected": -586.211669921875, + "loss": 0.6484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36673662066459656, + "rewards/margins": 0.08891665935516357, + "rewards/rejected": -0.45565328001976013, + "step": 269 + }, + { + "epoch": 0.1763409257898604, + "grad_norm": 13.47915067878089, + "learning_rate": 1.4738878406232824e-07, + "logits/chosen": -1.6974538564682007, + "logits/rejected": -1.7051836252212524, + "logps/chosen": -576.85302734375, + "logps/rejected": -595.0402221679688, + "loss": 0.6588, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4583105444908142, + "rewards/margins": 0.09714774042367935, + "rewards/rejected": -0.5554581880569458, + "step": 270 + }, + { + "epoch": 0.17699404032982285, + "grad_norm": 26.554664114423755, + "learning_rate": 1.4734383782080914e-07, + "logits/chosen": -1.682262659072876, + "logits/rejected": -1.653159499168396, + "logps/chosen": -584.1964721679688, + "logps/rejected": -617.1187744140625, + "loss": 0.6637, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.42176809906959534, + "rewards/margins": 0.07791588455438614, + "rewards/rejected": -0.4996839761734009, + "step": 271 + }, + { + "epoch": 0.1776471548697853, + "grad_norm": 10.821406826613458, + "learning_rate": 1.4729851502012636e-07, + "logits/chosen": -1.711740255355835, + "logits/rejected": -1.7466825246810913, + "logps/chosen": -522.8740844726562, + "logps/rejected": -510.05743408203125, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.388105183839798, + "rewards/margins": 0.08373910188674927, + "rewards/rejected": -0.47184431552886963, + "step": 272 + }, + { + "epoch": 0.17830026940974775, + "grad_norm": 6.675378991764519, + "learning_rate": 1.4725281589619103e-07, + "logits/chosen": -1.6703485250473022, + "logits/rejected": -1.6685247421264648, + "logps/chosen": -530.2496337890625, + "logps/rejected": -535.6815795898438, + "loss": 0.6426, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.42745375633239746, + "rewards/margins": 0.18275927007198334, + "rewards/rejected": -0.6102129817008972, + "step": 273 + }, + { + "epoch": 0.1789533839497102, + "grad_norm": 9.179293872139016, + "learning_rate": 1.4720674068687308e-07, + "logits/chosen": -1.7541323900222778, + "logits/rejected": -1.811903476715088, + "logps/chosen": -548.3256225585938, + "logps/rejected": -549.988525390625, + "loss": 0.6477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4320800304412842, + "rewards/margins": 0.04719913750886917, + "rewards/rejected": -0.47927919030189514, + "step": 274 + }, + { + "epoch": 0.17960649848967264, + "grad_norm": 19.694787175563757, + "learning_rate": 1.4716028963200005e-07, + "logits/chosen": -1.692307472229004, + "logits/rejected": -1.6705561876296997, + "logps/chosen": -493.9141845703125, + "logps/rejected": -527.4869384765625, + "loss": 0.6533, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3439943492412567, + "rewards/margins": 0.14113494753837585, + "rewards/rejected": -0.48512929677963257, + "step": 275 + }, + { + "epoch": 0.18025961302963509, + "grad_norm": 20.421232540328838, + "learning_rate": 1.4711346297335575e-07, + "logits/chosen": -1.7493505477905273, + "logits/rejected": -1.7484819889068604, + "logps/chosen": -562.100341796875, + "logps/rejected": -544.1605224609375, + "loss": 0.6586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.39796724915504456, + "rewards/margins": 0.1226586103439331, + "rewards/rejected": -0.5206258296966553, + "step": 276 + }, + { + "epoch": 0.18091272756959753, + "grad_norm": 7.813830449111613, + "learning_rate": 1.4706626095467905e-07, + "logits/chosen": -1.6537861824035645, + "logits/rejected": -1.6490662097930908, + "logps/chosen": -615.1405029296875, + "logps/rejected": -640.3934326171875, + "loss": 0.6364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39063894748687744, + "rewards/margins": 0.22695013880729675, + "rewards/rejected": -0.6175890564918518, + "step": 277 + }, + { + "epoch": 0.18156584210955998, + "grad_norm": 18.137788034555644, + "learning_rate": 1.470186838216627e-07, + "logits/chosen": -1.7237962484359741, + "logits/rejected": -1.7260386943817139, + "logps/chosen": -560.9794921875, + "logps/rejected": -544.9048461914062, + "loss": 0.6549, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4301608204841614, + "rewards/margins": 0.10491985827684402, + "rewards/rejected": -0.5350806713104248, + "step": 278 + }, + { + "epoch": 0.18221895664952242, + "grad_norm": 6.203981970076561, + "learning_rate": 1.469707318219519e-07, + "logits/chosen": -1.7497467994689941, + "logits/rejected": -1.7625311613082886, + "logps/chosen": -476.921875, + "logps/rejected": -468.9197692871094, + "loss": 0.6288, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.39907845854759216, + "rewards/margins": 0.09824170172214508, + "rewards/rejected": -0.49732017517089844, + "step": 279 + }, + { + "epoch": 0.18287207118948487, + "grad_norm": 30.605405460520963, + "learning_rate": 1.4692240520514308e-07, + "logits/chosen": -1.692575216293335, + "logits/rejected": -1.7037385702133179, + "logps/chosen": -459.114990234375, + "logps/rejected": -475.6065368652344, + "loss": 0.6475, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.37590938806533813, + "rewards/margins": 0.06421739608049393, + "rewards/rejected": -0.4401267468929291, + "step": 280 + }, + { + "epoch": 0.18352518572944732, + "grad_norm": 6.890672032716978, + "learning_rate": 1.4687370422278264e-07, + "logits/chosen": -1.6996791362762451, + "logits/rejected": -1.7280397415161133, + "logps/chosen": -567.3660278320312, + "logps/rejected": -556.817138671875, + "loss": 0.6557, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4747771620750427, + "rewards/margins": 0.10796495527029037, + "rewards/rejected": -0.5827420949935913, + "step": 281 + }, + { + "epoch": 0.18417830026940976, + "grad_norm": 7.575771612722592, + "learning_rate": 1.4682462912836556e-07, + "logits/chosen": -1.6831636428833008, + "logits/rejected": -1.659497857093811, + "logps/chosen": -591.215576171875, + "logps/rejected": -606.3766479492188, + "loss": 0.6464, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5020853281021118, + "rewards/margins": 0.08574617654085159, + "rewards/rejected": -0.5878314971923828, + "step": 282 + }, + { + "epoch": 0.18483141480937218, + "grad_norm": 14.228779893218709, + "learning_rate": 1.4677518017733416e-07, + "logits/chosen": -1.620701789855957, + "logits/rejected": -1.6582701206207275, + "logps/chosen": -493.6382141113281, + "logps/rejected": -494.93731689453125, + "loss": 0.652, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4052881896495819, + "rewards/margins": 0.05283729359507561, + "rewards/rejected": -0.4581254720687866, + "step": 283 + }, + { + "epoch": 0.18548452934933463, + "grad_norm": 9.401784297751998, + "learning_rate": 1.467253576270767e-07, + "logits/chosen": -1.741771936416626, + "logits/rejected": -1.75984787940979, + "logps/chosen": -491.3598937988281, + "logps/rejected": -527.6268310546875, + "loss": 0.6431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45621737837791443, + "rewards/margins": 0.12358233332633972, + "rewards/rejected": -0.5797997713088989, + "step": 284 + }, + { + "epoch": 0.18613764388929707, + "grad_norm": 8.738330524484025, + "learning_rate": 1.466751617369261e-07, + "logits/chosen": -1.7441930770874023, + "logits/rejected": -1.8137414455413818, + "logps/chosen": -600.4554443359375, + "logps/rejected": -585.8037719726562, + "loss": 0.6408, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5598416924476624, + "rewards/margins": 0.06251634657382965, + "rewards/rejected": -0.6223580241203308, + "step": 285 + }, + { + "epoch": 0.18679075842925952, + "grad_norm": 7.921733780275398, + "learning_rate": 1.4662459276815857e-07, + "logits/chosen": -1.740809679031372, + "logits/rejected": -1.6921014785766602, + "logps/chosen": -521.9088745117188, + "logps/rejected": -613.4127807617188, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5204059481620789, + "rewards/margins": 0.23885881900787354, + "rewards/rejected": -0.7592648267745972, + "step": 286 + }, + { + "epoch": 0.18744387296922196, + "grad_norm": 14.993635596662422, + "learning_rate": 1.4657365098399217e-07, + "logits/chosen": -1.6475145816802979, + "logits/rejected": -1.6759244203567505, + "logps/chosen": -588.921142578125, + "logps/rejected": -621.540771484375, + "loss": 0.6506, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5357760787010193, + "rewards/margins": 0.17871497571468353, + "rewards/rejected": -0.714491069316864, + "step": 287 + }, + { + "epoch": 0.1880969875091844, + "grad_norm": 13.969578435371153, + "learning_rate": 1.4652233664958564e-07, + "logits/chosen": -1.6996246576309204, + "logits/rejected": -1.6887569427490234, + "logps/chosen": -524.6732177734375, + "logps/rejected": -578.5980224609375, + "loss": 0.6505, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.513081431388855, + "rewards/margins": 0.24069830775260925, + "rewards/rejected": -0.7537796497344971, + "step": 288 + }, + { + "epoch": 0.18875010204914686, + "grad_norm": 11.610438307536514, + "learning_rate": 1.4647065003203673e-07, + "logits/chosen": -1.629154086112976, + "logits/rejected": -1.675700306892395, + "logps/chosen": -522.1669921875, + "logps/rejected": -537.0850830078125, + "loss": 0.6589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5520819425582886, + "rewards/margins": 0.10711211711168289, + "rewards/rejected": -0.6591941118240356, + "step": 289 + }, + { + "epoch": 0.1894032165891093, + "grad_norm": 10.283639619093217, + "learning_rate": 1.4641859140038115e-07, + "logits/chosen": -1.7125153541564941, + "logits/rejected": -1.7900097370147705, + "logps/chosen": -481.0557861328125, + "logps/rejected": -466.9371643066406, + "loss": 0.6447, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4833194613456726, + "rewards/margins": 0.10249477624893188, + "rewards/rejected": -0.5858142375946045, + "step": 290 + }, + { + "epoch": 0.19005633112907175, + "grad_norm": 10.863781043680522, + "learning_rate": 1.4636616102559085e-07, + "logits/chosen": -1.7732642889022827, + "logits/rejected": -1.7560861110687256, + "logps/chosen": -497.37286376953125, + "logps/rejected": -492.2978515625, + "loss": 0.6447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5158191919326782, + "rewards/margins": 0.12314166873693466, + "rewards/rejected": -0.6389608383178711, + "step": 291 + }, + { + "epoch": 0.1907094456690342, + "grad_norm": 8.437201886799969, + "learning_rate": 1.4631335918057284e-07, + "logits/chosen": -1.755825161933899, + "logits/rejected": -1.7563961744308472, + "logps/chosen": -545.761962890625, + "logps/rejected": -605.7576904296875, + "loss": 0.6288, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5955057740211487, + "rewards/margins": 0.1695830523967743, + "rewards/rejected": -0.7650887966156006, + "step": 292 + }, + { + "epoch": 0.19136256020899664, + "grad_norm": 8.28332489348213, + "learning_rate": 1.4626018614016762e-07, + "logits/chosen": -1.6458687782287598, + "logits/rejected": -1.7265384197235107, + "logps/chosen": -524.3062744140625, + "logps/rejected": -535.7152709960938, + "loss": 0.6534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5473222136497498, + "rewards/margins": 0.09474081546068192, + "rewards/rejected": -0.6420629620552063, + "step": 293 + }, + { + "epoch": 0.1920156747489591, + "grad_norm": 12.121969382291516, + "learning_rate": 1.4620664218114785e-07, + "logits/chosen": -1.7038286924362183, + "logits/rejected": -1.7111061811447144, + "logps/chosen": -608.0626220703125, + "logps/rejected": -610.8870849609375, + "loss": 0.6474, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6269205212593079, + "rewards/margins": 0.192575603723526, + "rewards/rejected": -0.8194961547851562, + "step": 294 + }, + { + "epoch": 0.19266878928892153, + "grad_norm": 8.914872873902894, + "learning_rate": 1.4615272758221687e-07, + "logits/chosen": -1.6704249382019043, + "logits/rejected": -1.6967017650604248, + "logps/chosen": -571.5804443359375, + "logps/rejected": -568.3060302734375, + "loss": 0.6579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5785356163978577, + "rewards/margins": 0.08196890354156494, + "rewards/rejected": -0.6605044603347778, + "step": 295 + }, + { + "epoch": 0.19332190382888398, + "grad_norm": 7.935449781786862, + "learning_rate": 1.4609844262400722e-07, + "logits/chosen": -1.7343144416809082, + "logits/rejected": -1.7453346252441406, + "logps/chosen": -595.2299194335938, + "logps/rejected": -627.33984375, + "loss": 0.6345, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6429135799407959, + "rewards/margins": 0.16354820132255554, + "rewards/rejected": -0.8064618110656738, + "step": 296 + }, + { + "epoch": 0.19397501836884642, + "grad_norm": 7.485459727998883, + "learning_rate": 1.4604378758907928e-07, + "logits/chosen": -1.717028021812439, + "logits/rejected": -1.7182062864303589, + "logps/chosen": -603.7417602539062, + "logps/rejected": -610.2848510742188, + "loss": 0.6507, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6892397999763489, + "rewards/margins": 0.20615926384925842, + "rewards/rejected": -0.8953990936279297, + "step": 297 + }, + { + "epoch": 0.19462813290880887, + "grad_norm": 11.03240678642255, + "learning_rate": 1.459887627619196e-07, + "logits/chosen": -1.7350809574127197, + "logits/rejected": -1.7344433069229126, + "logps/chosen": -579.1885986328125, + "logps/rejected": -620.0966796875, + "loss": 0.6361, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6911602020263672, + "rewards/margins": 0.14496688544750214, + "rewards/rejected": -0.836126983165741, + "step": 298 + }, + { + "epoch": 0.19528124744877132, + "grad_norm": 8.53488849066449, + "learning_rate": 1.4593336842893963e-07, + "logits/chosen": -1.745806336402893, + "logits/rejected": -1.6635572910308838, + "logps/chosen": -607.8656616210938, + "logps/rejected": -672.4678344726562, + "loss": 0.6208, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6869965195655823, + "rewards/margins": 0.2365533858537674, + "rewards/rejected": -0.9235499501228333, + "step": 299 + }, + { + "epoch": 0.19593436198873376, + "grad_norm": 9.49489920339701, + "learning_rate": 1.458776048784742e-07, + "logits/chosen": -1.6529350280761719, + "logits/rejected": -1.6770867109298706, + "logps/chosen": -532.1659545898438, + "logps/rejected": -558.4205322265625, + "loss": 0.6306, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6781768798828125, + "rewards/margins": 0.10716618597507477, + "rewards/rejected": -0.7853431701660156, + "step": 300 + }, + { + "epoch": 0.19593436198873376, + "eval_logits/chosen": -1.7436009645462036, + "eval_logits/rejected": -1.7536306381225586, + "eval_logps/chosen": -574.737548828125, + "eval_logps/rejected": -584.3355712890625, + "eval_loss": 0.6346647143363953, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": -0.6439220309257507, + "eval_rewards/margins": 0.17703963816165924, + "eval_rewards/rejected": -0.8209616541862488, + "eval_runtime": 297.2957, + "eval_samples_per_second": 13.455, + "eval_steps_per_second": 0.841, + "step": 300 + }, + { + "epoch": 0.1965874765286962, + "grad_norm": 8.98774554330102, + "learning_rate": 1.4582147240077982e-07, + "logits/chosen": -1.7660400867462158, + "logits/rejected": -1.7696928977966309, + "logps/chosen": -551.3262939453125, + "logps/rejected": -584.052001953125, + "loss": 0.6399, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.6249464154243469, + "rewards/margins": 0.09611751139163971, + "rewards/rejected": -0.7210639715194702, + "step": 301 + }, + { + "epoch": 0.19724059106865865, + "grad_norm": 7.863471372182064, + "learning_rate": 1.4576497128803348e-07, + "logits/chosen": -1.7265328168869019, + "logits/rejected": -1.6853750944137573, + "logps/chosen": -536.3348999023438, + "logps/rejected": -628.0272827148438, + "loss": 0.625, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6308943033218384, + "rewards/margins": 0.23925556242465973, + "rewards/rejected": -0.8701498508453369, + "step": 302 + }, + { + "epoch": 0.1978937056086211, + "grad_norm": 28.45275146429454, + "learning_rate": 1.4570810183433083e-07, + "logits/chosen": -1.6958410739898682, + "logits/rejected": -1.7414443492889404, + "logps/chosen": -663.8955078125, + "logps/rejected": -632.41015625, + "loss": 0.6593, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8960934281349182, + "rewards/margins": 0.07879979908466339, + "rewards/rejected": -0.9748932123184204, + "step": 303 + }, + { + "epoch": 0.19854682014858355, + "grad_norm": 11.410487006682061, + "learning_rate": 1.4565086433568487e-07, + "logits/chosen": -1.7225995063781738, + "logits/rejected": -1.6888340711593628, + "logps/chosen": -581.2233276367188, + "logps/rejected": -675.6259155273438, + "loss": 0.6429, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6933416724205017, + "rewards/margins": 0.3561021387577057, + "rewards/rejected": -1.0494438409805298, + "step": 304 + }, + { + "epoch": 0.199199934688546, + "grad_norm": 8.270002666629233, + "learning_rate": 1.4559325909002424e-07, + "logits/chosen": -1.7664427757263184, + "logits/rejected": -1.7638676166534424, + "logps/chosen": -560.244873046875, + "logps/rejected": -626.065185546875, + "loss": 0.6395, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6132575869560242, + "rewards/margins": 0.3563953936100006, + "rewards/rejected": -0.9696530103683472, + "step": 305 + }, + { + "epoch": 0.19985304922850844, + "grad_norm": 12.608968024472125, + "learning_rate": 1.4553528639719185e-07, + "logits/chosen": -1.6360845565795898, + "logits/rejected": -1.7036012411117554, + "logps/chosen": -494.1507568359375, + "logps/rejected": -567.8990478515625, + "loss": 0.6366, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6473857760429382, + "rewards/margins": 0.21676453948020935, + "rewards/rejected": -0.8641502857208252, + "step": 306 + }, + { + "epoch": 0.20050616376847089, + "grad_norm": 17.135231726028064, + "learning_rate": 1.4547694655894313e-07, + "logits/chosen": -1.7396022081375122, + "logits/rejected": -1.6808857917785645, + "logps/chosen": -533.951416015625, + "logps/rejected": -562.3763427734375, + "loss": 0.6188, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6712486743927002, + "rewards/margins": 0.19814179837703705, + "rewards/rejected": -0.8693904876708984, + "step": 307 + }, + { + "epoch": 0.20115927830843333, + "grad_norm": 6.928796136463469, + "learning_rate": 1.454182398789446e-07, + "logits/chosen": -1.776479721069336, + "logits/rejected": -1.7864151000976562, + "logps/chosen": -642.5709228515625, + "logps/rejected": -602.6436157226562, + "loss": 0.6323, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7122976779937744, + "rewards/margins": 0.10113891959190369, + "rewards/rejected": -0.8134365677833557, + "step": 308 + }, + { + "epoch": 0.20181239284839578, + "grad_norm": 6.579505553675459, + "learning_rate": 1.4535916666277225e-07, + "logits/chosen": -1.6873722076416016, + "logits/rejected": -1.683058500289917, + "logps/chosen": -625.4521484375, + "logps/rejected": -681.3677978515625, + "loss": 0.6223, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7979632616043091, + "rewards/margins": 0.28991833329200745, + "rewards/rejected": -1.0878815650939941, + "step": 309 + }, + { + "epoch": 0.20246550738835822, + "grad_norm": 7.215238256234268, + "learning_rate": 1.4529972721790987e-07, + "logits/chosen": -1.7267826795578003, + "logits/rejected": -1.6847716569900513, + "logps/chosen": -607.7223510742188, + "logps/rejected": -689.0610961914062, + "loss": 0.5934, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6881787776947021, + "rewards/margins": 0.390371173620224, + "rewards/rejected": -1.0785499811172485, + "step": 310 + }, + { + "epoch": 0.20311862192832067, + "grad_norm": 8.708876826384516, + "learning_rate": 1.4523992185374762e-07, + "logits/chosen": -1.6836024522781372, + "logits/rejected": -1.7239104509353638, + "logps/chosen": -581.9091796875, + "logps/rejected": -602.4833374023438, + "loss": 0.6503, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6792923212051392, + "rewards/margins": 0.17103339731693268, + "rewards/rejected": -0.8503257632255554, + "step": 311 + }, + { + "epoch": 0.20377173646828312, + "grad_norm": 15.693301460696688, + "learning_rate": 1.4517975088158024e-07, + "logits/chosen": -1.6466374397277832, + "logits/rejected": -1.7001001834869385, + "logps/chosen": -522.967041015625, + "logps/rejected": -504.9639892578125, + "loss": 0.6303, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6082721948623657, + "rewards/margins": 0.13548986613750458, + "rewards/rejected": -0.7437620759010315, + "step": 312 + }, + { + "epoch": 0.20442485100824556, + "grad_norm": 9.45995269518211, + "learning_rate": 1.4511921461460552e-07, + "logits/chosen": -1.6590129137039185, + "logits/rejected": -1.7052656412124634, + "logps/chosen": -619.462158203125, + "logps/rejected": -658.5916748046875, + "loss": 0.6425, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8983247876167297, + "rewards/margins": 0.1738293468952179, + "rewards/rejected": -1.0721540451049805, + "step": 313 + }, + { + "epoch": 0.205077965548208, + "grad_norm": 8.619266517151214, + "learning_rate": 1.4505831336792268e-07, + "logits/chosen": -1.6681677103042603, + "logits/rejected": -1.6365997791290283, + "logps/chosen": -623.3759155273438, + "logps/rejected": -652.1790771484375, + "loss": 0.6061, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.817943811416626, + "rewards/margins": 0.2616656422615051, + "rewards/rejected": -1.0796093940734863, + "step": 314 + }, + { + "epoch": 0.20573108008817045, + "grad_norm": 16.305545075033027, + "learning_rate": 1.449970474585307e-07, + "logits/chosen": -1.7769198417663574, + "logits/rejected": -1.7549265623092651, + "logps/chosen": -508.7225646972656, + "logps/rejected": -541.2574462890625, + "loss": 0.6153, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8023353815078735, + "rewards/margins": 0.1428290456533432, + "rewards/rejected": -0.9451643228530884, + "step": 315 + }, + { + "epoch": 0.2063841946281329, + "grad_norm": 16.881338125964565, + "learning_rate": 1.4493541720532666e-07, + "logits/chosen": -1.7054510116577148, + "logits/rejected": -1.7318084239959717, + "logps/chosen": -577.06787109375, + "logps/rejected": -674.53857421875, + "loss": 0.5824, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7155259251594543, + "rewards/margins": 0.4451526999473572, + "rewards/rejected": -1.1606786251068115, + "step": 316 + }, + { + "epoch": 0.20703730916809535, + "grad_norm": 8.857782623326043, + "learning_rate": 1.4487342292910414e-07, + "logits/chosen": -1.6867567300796509, + "logits/rejected": -1.7076420783996582, + "logps/chosen": -634.0771484375, + "logps/rejected": -662.7860107421875, + "loss": 0.6135, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9092274904251099, + "rewards/margins": 0.23942092061042786, + "rewards/rejected": -1.1486485004425049, + "step": 317 + }, + { + "epoch": 0.2076904237080578, + "grad_norm": 32.895897939433254, + "learning_rate": 1.4481106495255145e-07, + "logits/chosen": -1.6250176429748535, + "logits/rejected": -1.6629047393798828, + "logps/chosen": -638.931640625, + "logps/rejected": -675.3734130859375, + "loss": 0.6102, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9061939120292664, + "rewards/margins": 0.18294726312160492, + "rewards/rejected": -1.0891411304473877, + "step": 318 + }, + { + "epoch": 0.20834353824802024, + "grad_norm": 35.209821508774134, + "learning_rate": 1.4474834360025005e-07, + "logits/chosen": -1.7239129543304443, + "logits/rejected": -1.7509957551956177, + "logps/chosen": -539.3623046875, + "logps/rejected": -585.4423828125, + "loss": 0.6255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7921657562255859, + "rewards/margins": 0.20182442665100098, + "rewards/rejected": -0.9939901232719421, + "step": 319 + }, + { + "epoch": 0.20899665278798268, + "grad_norm": 22.542720186905754, + "learning_rate": 1.446852591986728e-07, + "logits/chosen": -1.7275651693344116, + "logits/rejected": -1.7619481086730957, + "logps/chosen": -574.9896850585938, + "logps/rejected": -537.2247924804688, + "loss": 0.6222, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7382320761680603, + "rewards/margins": 0.09026205539703369, + "rewards/rejected": -0.8284941911697388, + "step": 320 + }, + { + "epoch": 0.20964976732794513, + "grad_norm": 11.689355163454417, + "learning_rate": 1.4462181207618226e-07, + "logits/chosen": -1.6684459447860718, + "logits/rejected": -1.664050817489624, + "logps/chosen": -612.190673828125, + "logps/rejected": -627.1231079101562, + "loss": 0.6396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9567988514900208, + "rewards/margins": 0.15958093106746674, + "rewards/rejected": -1.1163798570632935, + "step": 321 + }, + { + "epoch": 0.21030288186790758, + "grad_norm": 8.28817528253119, + "learning_rate": 1.445580025630291e-07, + "logits/chosen": -1.696157455444336, + "logits/rejected": -1.693131446838379, + "logps/chosen": -616.43017578125, + "logps/rejected": -605.6445922851562, + "loss": 0.6273, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8394571542739868, + "rewards/margins": 0.19782942533493042, + "rewards/rejected": -1.0372865200042725, + "step": 322 + }, + { + "epoch": 0.21095599640787002, + "grad_norm": 9.732936932214093, + "learning_rate": 1.444938309913501e-07, + "logits/chosen": -1.6943405866622925, + "logits/rejected": -1.7072209119796753, + "logps/chosen": -584.8599853515625, + "logps/rejected": -641.0568237304688, + "loss": 0.6046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7837256193161011, + "rewards/margins": 0.3857865333557129, + "rewards/rejected": -1.1695120334625244, + "step": 323 + }, + { + "epoch": 0.21160911094783247, + "grad_norm": 11.744162208267507, + "learning_rate": 1.444292976951668e-07, + "logits/chosen": -1.6355708837509155, + "logits/rejected": -1.627254843711853, + "logps/chosen": -610.7489013671875, + "logps/rejected": -618.13330078125, + "loss": 0.5927, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8486420512199402, + "rewards/margins": 0.18311645090579987, + "rewards/rejected": -1.0317585468292236, + "step": 324 + }, + { + "epoch": 0.21226222548779491, + "grad_norm": 8.36583130670152, + "learning_rate": 1.4436440301038337e-07, + "logits/chosen": -1.6780744791030884, + "logits/rejected": -1.681298017501831, + "logps/chosen": -618.8885498046875, + "logps/rejected": -628.7433471679688, + "loss": 0.6279, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9191569089889526, + "rewards/margins": 0.21737900376319885, + "rewards/rejected": -1.1365360021591187, + "step": 325 + }, + { + "epoch": 0.21291534002775736, + "grad_norm": 7.011876209354938, + "learning_rate": 1.4429914727478526e-07, + "logits/chosen": -1.693228840827942, + "logits/rejected": -1.6980655193328857, + "logps/chosen": -560.333984375, + "logps/rejected": -595.471435546875, + "loss": 0.6033, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7711371779441833, + "rewards/margins": 0.37155580520629883, + "rewards/rejected": -1.142693042755127, + "step": 326 + }, + { + "epoch": 0.2135684545677198, + "grad_norm": 8.197929978948176, + "learning_rate": 1.4423353082803705e-07, + "logits/chosen": -1.6932696104049683, + "logits/rejected": -1.7340316772460938, + "logps/chosen": -592.4081420898438, + "logps/rejected": -626.7181396484375, + "loss": 0.6079, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9063585996627808, + "rewards/margins": 0.23381030559539795, + "rewards/rejected": -1.1401689052581787, + "step": 327 + }, + { + "epoch": 0.21422156910768225, + "grad_norm": 21.483660249834884, + "learning_rate": 1.44167554011681e-07, + "logits/chosen": -1.6718993186950684, + "logits/rejected": -1.6670811176300049, + "logps/chosen": -570.2409057617188, + "logps/rejected": -584.1919555664062, + "loss": 0.6376, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6868249177932739, + "rewards/margins": 0.22840677201747894, + "rewards/rejected": -0.9152317643165588, + "step": 328 + }, + { + "epoch": 0.2148746836476447, + "grad_norm": 16.130012234299603, + "learning_rate": 1.4410121716913508e-07, + "logits/chosen": -1.6927289962768555, + "logits/rejected": -1.70664644241333, + "logps/chosen": -617.483642578125, + "logps/rejected": -620.2203369140625, + "loss": 0.6259, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9101148247718811, + "rewards/margins": 0.1278352588415146, + "rewards/rejected": -1.0379501581192017, + "step": 329 + }, + { + "epoch": 0.21552779818760714, + "grad_norm": 8.367739313272205, + "learning_rate": 1.4403452064569127e-07, + "logits/chosen": -1.6701712608337402, + "logits/rejected": -1.7202551364898682, + "logps/chosen": -661.21435546875, + "logps/rejected": -602.512451171875, + "loss": 0.6385, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1165893077850342, + "rewards/margins": 0.08732178062200546, + "rewards/rejected": -1.2039111852645874, + "step": 330 + }, + { + "epoch": 0.2161809127275696, + "grad_norm": 9.035063267419801, + "learning_rate": 1.439674647885137e-07, + "logits/chosen": -1.6763889789581299, + "logits/rejected": -1.6751902103424072, + "logps/chosen": -613.0372314453125, + "logps/rejected": -630.5709838867188, + "loss": 0.6134, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0509929656982422, + "rewards/margins": 0.17839497327804565, + "rewards/rejected": -1.2293879985809326, + "step": 331 + }, + { + "epoch": 0.21683402726753204, + "grad_norm": 9.470243202935096, + "learning_rate": 1.439000499466369e-07, + "logits/chosen": -1.7101950645446777, + "logits/rejected": -1.693764328956604, + "logps/chosen": -592.8642578125, + "logps/rejected": -629.7588500976562, + "loss": 0.6173, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9428937435150146, + "rewards/margins": 0.1368747353553772, + "rewards/rejected": -1.0797685384750366, + "step": 332 + }, + { + "epoch": 0.21748714180749448, + "grad_norm": 11.187409550489294, + "learning_rate": 1.4383227647096393e-07, + "logits/chosen": -1.651065707206726, + "logits/rejected": -1.6519496440887451, + "logps/chosen": -613.1717529296875, + "logps/rejected": -579.97265625, + "loss": 0.624, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9745721817016602, + "rewards/margins": 0.17566874623298645, + "rewards/rejected": -1.1502408981323242, + "step": 333 + }, + { + "epoch": 0.21814025634745693, + "grad_norm": 49.216595878368345, + "learning_rate": 1.4376414471426472e-07, + "logits/chosen": -1.7066453695297241, + "logits/rejected": -1.7159579992294312, + "logps/chosen": -645.9775390625, + "logps/rejected": -741.1705932617188, + "loss": 0.6203, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0998951196670532, + "rewards/margins": 0.478588730096817, + "rewards/rejected": -1.5784838199615479, + "step": 334 + }, + { + "epoch": 0.21879337088741938, + "grad_norm": 16.168657963184085, + "learning_rate": 1.436956550311739e-07, + "logits/chosen": -1.7134239673614502, + "logits/rejected": -1.7491358518600464, + "logps/chosen": -618.333251953125, + "logps/rejected": -704.8095092773438, + "loss": 0.613, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.038926124572754, + "rewards/margins": 0.38716238737106323, + "rewards/rejected": -1.426088571548462, + "step": 335 + }, + { + "epoch": 0.21944648542738182, + "grad_norm": 15.399404851620021, + "learning_rate": 1.4362680777818932e-07, + "logits/chosen": -1.6947367191314697, + "logits/rejected": -1.7013921737670898, + "logps/chosen": -616.1701049804688, + "logps/rejected": -646.965576171875, + "loss": 0.6476, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.009097695350647, + "rewards/margins": 0.12043975293636322, + "rewards/rejected": -1.1295373439788818, + "step": 336 + }, + { + "epoch": 0.22009959996734427, + "grad_norm": 12.377040896457764, + "learning_rate": 1.435576033136699e-07, + "logits/chosen": -1.72658109664917, + "logits/rejected": -1.7111766338348389, + "logps/chosen": -691.18701171875, + "logps/rejected": -746.902587890625, + "loss": 0.6108, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1553032398223877, + "rewards/margins": 0.3523827791213989, + "rewards/rejected": -1.507685899734497, + "step": 337 + }, + { + "epoch": 0.2207527145073067, + "grad_norm": 21.708794576351895, + "learning_rate": 1.4348804199783397e-07, + "logits/chosen": -1.7045814990997314, + "logits/rejected": -1.733116865158081, + "logps/chosen": -591.8544921875, + "logps/rejected": -598.6890869140625, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.881680428981781, + "rewards/margins": 0.18818916380405426, + "rewards/rejected": -1.0698695182800293, + "step": 338 + }, + { + "epoch": 0.22140582904726916, + "grad_norm": 12.368369376917613, + "learning_rate": 1.4341812419275735e-07, + "logits/chosen": -1.7070914506912231, + "logits/rejected": -1.7090568542480469, + "logps/chosen": -598.9154663085938, + "logps/rejected": -546.4701538085938, + "loss": 0.6502, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9902876019477844, + "rewards/margins": 0.08627556264400482, + "rewards/rejected": -1.0765631198883057, + "step": 339 + }, + { + "epoch": 0.2220589435872316, + "grad_norm": 7.332074698031059, + "learning_rate": 1.4334785026237135e-07, + "logits/chosen": -1.757274866104126, + "logits/rejected": -1.7839263677597046, + "logps/chosen": -611.2498779296875, + "logps/rejected": -621.130859375, + "loss": 0.5915, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0745751857757568, + "rewards/margins": 0.25732651352882385, + "rewards/rejected": -1.3319017887115479, + "step": 340 + }, + { + "epoch": 0.22271205812719405, + "grad_norm": 16.45318119474388, + "learning_rate": 1.43277220572461e-07, + "logits/chosen": -1.5772120952606201, + "logits/rejected": -1.5908713340759277, + "logps/chosen": -537.1170043945312, + "logps/rejected": -550.2281494140625, + "loss": 0.6306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9023160338401794, + "rewards/margins": 0.1352323591709137, + "rewards/rejected": -1.037548303604126, + "step": 341 + }, + { + "epoch": 0.2233651726671565, + "grad_norm": 37.88300298632188, + "learning_rate": 1.4320623549066308e-07, + "logits/chosen": -1.7355406284332275, + "logits/rejected": -1.7195450067520142, + "logps/chosen": -686.9090576171875, + "logps/rejected": -739.3055419921875, + "loss": 0.579, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.1977063417434692, + "rewards/margins": 0.5113601088523865, + "rewards/rejected": -1.7090665102005005, + "step": 342 + }, + { + "epoch": 0.22401828720711894, + "grad_norm": 22.776462467379197, + "learning_rate": 1.4313489538646427e-07, + "logits/chosen": -1.7235167026519775, + "logits/rejected": -1.7320938110351562, + "logps/chosen": -663.5242309570312, + "logps/rejected": -628.5629272460938, + "loss": 0.6169, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0688875913619995, + "rewards/margins": 0.16213612258434296, + "rewards/rejected": -1.2310236692428589, + "step": 343 + }, + { + "epoch": 0.2246714017470814, + "grad_norm": 8.488895496301154, + "learning_rate": 1.4306320063119916e-07, + "logits/chosen": -1.6490345001220703, + "logits/rejected": -1.6467983722686768, + "logps/chosen": -686.3357543945312, + "logps/rejected": -679.5280151367188, + "loss": 0.6362, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.28061044216156, + "rewards/margins": 0.09073764830827713, + "rewards/rejected": -1.371348261833191, + "step": 344 + }, + { + "epoch": 0.22532451628704384, + "grad_norm": 22.168028050033644, + "learning_rate": 1.4299115159804836e-07, + "logits/chosen": -1.7651727199554443, + "logits/rejected": -1.756993293762207, + "logps/chosen": -707.9656372070312, + "logps/rejected": -787.037109375, + "loss": 0.6355, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.312680959701538, + "rewards/margins": 0.39120543003082275, + "rewards/rejected": -1.7038863897323608, + "step": 345 + }, + { + "epoch": 0.22597763082700628, + "grad_norm": 7.58865714123304, + "learning_rate": 1.4291874866203655e-07, + "logits/chosen": -1.659616231918335, + "logits/rejected": -1.637772798538208, + "logps/chosen": -658.1541137695312, + "logps/rejected": -791.5831909179688, + "loss": 0.5703, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1292518377304077, + "rewards/margins": 0.6298790574073792, + "rewards/rejected": -1.759130835533142, + "step": 346 + }, + { + "epoch": 0.22663074536696873, + "grad_norm": 26.071692354910912, + "learning_rate": 1.428459922000305e-07, + "logits/chosen": -1.5885182619094849, + "logits/rejected": -1.6136213541030884, + "logps/chosen": -555.8087158203125, + "logps/rejected": -621.08349609375, + "loss": 0.5983, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9678431749343872, + "rewards/margins": 0.3450620472431183, + "rewards/rejected": -1.312905192375183, + "step": 347 + }, + { + "epoch": 0.22728385990693117, + "grad_norm": 7.8840447097778545, + "learning_rate": 1.4277288259073708e-07, + "logits/chosen": -1.568705439567566, + "logits/rejected": -1.5865546464920044, + "logps/chosen": -589.7760009765625, + "logps/rejected": -573.239990234375, + "loss": 0.6277, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1354511976242065, + "rewards/margins": 0.18996107578277588, + "rewards/rejected": -1.3254122734069824, + "step": 348 + }, + { + "epoch": 0.22793697444689362, + "grad_norm": 10.92884715777762, + "learning_rate": 1.4269942021470148e-07, + "logits/chosen": -1.6641845703125, + "logits/rejected": -1.6769914627075195, + "logps/chosen": -672.136474609375, + "logps/rejected": -664.6466674804688, + "loss": 0.6121, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0376440286636353, + "rewards/margins": 0.1383248120546341, + "rewards/rejected": -1.1759687662124634, + "step": 349 + }, + { + "epoch": 0.22859008898685607, + "grad_norm": 31.26872455787033, + "learning_rate": 1.4262560545430495e-07, + "logits/chosen": -1.5913841724395752, + "logits/rejected": -1.6360561847686768, + "logps/chosen": -617.2501831054688, + "logps/rejected": -550.8568115234375, + "loss": 0.6164, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.979924738407135, + "rewards/margins": 0.24141384661197662, + "rewards/rejected": -1.2213386297225952, + "step": 350 + }, + { + "epoch": 0.2292432035268185, + "grad_norm": 8.368119693346522, + "learning_rate": 1.4255143869376301e-07, + "logits/chosen": -1.7337816953659058, + "logits/rejected": -1.6971759796142578, + "logps/chosen": -652.0234375, + "logps/rejected": -777.8805541992188, + "loss": 0.5913, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1779980659484863, + "rewards/margins": 0.6400733590126038, + "rewards/rejected": -1.8180711269378662, + "step": 351 + }, + { + "epoch": 0.22989631806678096, + "grad_norm": 7.04954036164927, + "learning_rate": 1.424769203191234e-07, + "logits/chosen": -1.706908941268921, + "logits/rejected": -1.7061164379119873, + "logps/chosen": -648.4498291015625, + "logps/rejected": -606.88623046875, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2371093034744263, + "rewards/margins": 0.11428666114807129, + "rewards/rejected": -1.3513959646224976, + "step": 352 + }, + { + "epoch": 0.2305494326067434, + "grad_norm": 24.647751119336064, + "learning_rate": 1.42402050718264e-07, + "logits/chosen": -1.6741199493408203, + "logits/rejected": -1.743821620941162, + "logps/chosen": -677.9364624023438, + "logps/rejected": -710.4935302734375, + "loss": 0.5985, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0869948863983154, + "rewards/margins": 0.1914815604686737, + "rewards/rejected": -1.278476357460022, + "step": 353 + }, + { + "epoch": 0.23120254714670585, + "grad_norm": 32.99725275222979, + "learning_rate": 1.4232683028089092e-07, + "logits/chosen": -1.6567903757095337, + "logits/rejected": -1.6555206775665283, + "logps/chosen": -587.80029296875, + "logps/rejected": -598.2508544921875, + "loss": 0.6307, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.024768352508545, + "rewards/margins": 0.3201058804988861, + "rewards/rejected": -1.3448742628097534, + "step": 354 + }, + { + "epoch": 0.2318556616866683, + "grad_norm": 15.301036178500334, + "learning_rate": 1.4225125939853637e-07, + "logits/chosen": -1.5989353656768799, + "logits/rejected": -1.6277306079864502, + "logps/chosen": -566.171875, + "logps/rejected": -590.93505859375, + "loss": 0.6273, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9104868173599243, + "rewards/margins": 0.22141122817993164, + "rewards/rejected": -1.1318979263305664, + "step": 355 + }, + { + "epoch": 0.23250877622663074, + "grad_norm": 8.150144656099924, + "learning_rate": 1.4217533846455675e-07, + "logits/chosen": -1.6726055145263672, + "logits/rejected": -1.6693233251571655, + "logps/chosen": -643.8858642578125, + "logps/rejected": -628.0254516601562, + "loss": 0.6466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2304469347000122, + "rewards/margins": 0.19707579910755157, + "rewards/rejected": -1.427522897720337, + "step": 356 + }, + { + "epoch": 0.2331618907665932, + "grad_norm": 33.25010241380852, + "learning_rate": 1.4209906787413047e-07, + "logits/chosen": -1.6271083354949951, + "logits/rejected": -1.6530542373657227, + "logps/chosen": -599.4658813476562, + "logps/rejected": -608.0745849609375, + "loss": 0.5938, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1141527891159058, + "rewards/margins": 0.19912773370742798, + "rewards/rejected": -1.3132805824279785, + "step": 357 + }, + { + "epoch": 0.23381500530655563, + "grad_norm": 14.540966438844277, + "learning_rate": 1.420224480242559e-07, + "logits/chosen": -1.5631732940673828, + "logits/rejected": -1.582556128501892, + "logps/chosen": -584.55712890625, + "logps/rejected": -616.7429809570312, + "loss": 0.6118, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0560500621795654, + "rewards/margins": 0.2983832359313965, + "rewards/rejected": -1.354433298110962, + "step": 358 + }, + { + "epoch": 0.23446811984651808, + "grad_norm": 11.291749985668568, + "learning_rate": 1.4194547931374948e-07, + "logits/chosen": -1.6868391036987305, + "logits/rejected": -1.7105621099472046, + "logps/chosen": -600.7924194335938, + "logps/rejected": -677.1724853515625, + "loss": 0.6779, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.23578941822052, + "rewards/margins": 0.33331653475761414, + "rewards/rejected": -1.569106101989746, + "step": 359 + }, + { + "epoch": 0.23512123438648053, + "grad_norm": 22.901901301811037, + "learning_rate": 1.418681621432434e-07, + "logits/chosen": -1.574560523033142, + "logits/rejected": -1.599095106124878, + "logps/chosen": -599.9710693359375, + "logps/rejected": -652.0952758789062, + "loss": 0.6254, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1917017698287964, + "rewards/margins": 0.3461110591888428, + "rewards/rejected": -1.5378127098083496, + "step": 360 + }, + { + "epoch": 0.23577434892644297, + "grad_norm": 25.59056348020549, + "learning_rate": 1.417904969151837e-07, + "logits/chosen": -1.7425224781036377, + "logits/rejected": -1.7415771484375, + "logps/chosen": -704.6986694335938, + "logps/rejected": -693.380126953125, + "loss": 0.6119, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2282774448394775, + "rewards/margins": 0.26026061177253723, + "rewards/rejected": -1.488538146018982, + "step": 361 + }, + { + "epoch": 0.23642746346640542, + "grad_norm": 8.463533236278087, + "learning_rate": 1.4171248403382806e-07, + "logits/chosen": -1.6078510284423828, + "logits/rejected": -1.700717568397522, + "logps/chosen": -629.0587768554688, + "logps/rejected": -598.7468872070312, + "loss": 0.6081, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0717506408691406, + "rewards/margins": 0.29170456528663635, + "rewards/rejected": -1.3634551763534546, + "step": 362 + }, + { + "epoch": 0.23708057800636786, + "grad_norm": 27.979305395002925, + "learning_rate": 1.4163412390524378e-07, + "logits/chosen": -1.6555612087249756, + "logits/rejected": -1.701103925704956, + "logps/chosen": -515.6534423828125, + "logps/rejected": -615.6427001953125, + "loss": 0.5931, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1157307624816895, + "rewards/margins": 0.35381343960762024, + "rewards/rejected": -1.4695440530776978, + "step": 363 + }, + { + "epoch": 0.2377336925463303, + "grad_norm": 29.822643828567546, + "learning_rate": 1.4155541693730556e-07, + "logits/chosen": -1.6813297271728516, + "logits/rejected": -1.6906770467758179, + "logps/chosen": -571.6758422851562, + "logps/rejected": -599.9418334960938, + "loss": 0.5808, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0341238975524902, + "rewards/margins": 0.39114344120025635, + "rewards/rejected": -1.425267219543457, + "step": 364 + }, + { + "epoch": 0.23838680708629276, + "grad_norm": 8.151382967050962, + "learning_rate": 1.414763635396935e-07, + "logits/chosen": -1.6014394760131836, + "logits/rejected": -1.5656381845474243, + "logps/chosen": -540.3904418945312, + "logps/rejected": -593.1724853515625, + "loss": 0.6379, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1642670631408691, + "rewards/margins": 0.3421732187271118, + "rewards/rejected": -1.506440281867981, + "step": 365 + }, + { + "epoch": 0.2390399216262552, + "grad_norm": 57.74463360143876, + "learning_rate": 1.4139696412389096e-07, + "logits/chosen": -1.7312068939208984, + "logits/rejected": -1.7102075815200806, + "logps/chosen": -643.19091796875, + "logps/rejected": -672.47509765625, + "loss": 0.5735, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1425721645355225, + "rewards/margins": 0.4072319269180298, + "rewards/rejected": -1.5498042106628418, + "step": 366 + }, + { + "epoch": 0.23969303616621765, + "grad_norm": 12.197736908947663, + "learning_rate": 1.4131721910318227e-07, + "logits/chosen": -1.8108876943588257, + "logits/rejected": -1.7958606481552124, + "logps/chosen": -733.1993408203125, + "logps/rejected": -763.6776733398438, + "loss": 0.5912, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.482933759689331, + "rewards/margins": 0.2555669844150543, + "rewards/rejected": -1.7385008335113525, + "step": 367 + }, + { + "epoch": 0.2403461507061801, + "grad_norm": 19.29177394775571, + "learning_rate": 1.4123712889265072e-07, + "logits/chosen": -1.6916940212249756, + "logits/rejected": -1.7217150926589966, + "logps/chosen": -671.9423217773438, + "logps/rejected": -675.45361328125, + "loss": 0.5719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3399765491485596, + "rewards/margins": 0.20047269761562347, + "rewards/rejected": -1.5404491424560547, + "step": 368 + }, + { + "epoch": 0.24099926524614254, + "grad_norm": 25.260138658321655, + "learning_rate": 1.4115669390917636e-07, + "logits/chosen": -1.6250600814819336, + "logits/rejected": -1.6512022018432617, + "logps/chosen": -625.4688720703125, + "logps/rejected": -614.0571899414062, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2662508487701416, + "rewards/margins": 0.20612508058547974, + "rewards/rejected": -1.4723761081695557, + "step": 369 + }, + { + "epoch": 0.241652379786105, + "grad_norm": 9.803497546984937, + "learning_rate": 1.4107591457143383e-07, + "logits/chosen": -1.5895332098007202, + "logits/rejected": -1.602332592010498, + "logps/chosen": -675.6551513671875, + "logps/rejected": -709.5709838867188, + "loss": 0.5924, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2006454467773438, + "rewards/margins": 0.3013772666454315, + "rewards/rejected": -1.5020227432250977, + "step": 370 + }, + { + "epoch": 0.24230549432606743, + "grad_norm": 8.545956638134795, + "learning_rate": 1.409947912998902e-07, + "logits/chosen": -1.6708649396896362, + "logits/rejected": -1.713352918624878, + "logps/chosen": -680.72607421875, + "logps/rejected": -674.447265625, + "loss": 0.6127, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2759678363800049, + "rewards/margins": 0.18677686154842377, + "rewards/rejected": -1.4627448320388794, + "step": 371 + }, + { + "epoch": 0.24295860886602988, + "grad_norm": 19.844846501689318, + "learning_rate": 1.4091332451680267e-07, + "logits/chosen": -1.7042357921600342, + "logits/rejected": -1.7265393733978271, + "logps/chosen": -675.3814086914062, + "logps/rejected": -644.4542846679688, + "loss": 0.6427, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4431321620941162, + "rewards/margins": 0.07462039589881897, + "rewards/rejected": -1.5177525281906128, + "step": 372 + }, + { + "epoch": 0.24361172340599233, + "grad_norm": 7.920082953904519, + "learning_rate": 1.408315146462166e-07, + "logits/chosen": -1.721234679222107, + "logits/rejected": -1.701398253440857, + "logps/chosen": -589.4611206054688, + "logps/rejected": -710.4976806640625, + "loss": 0.581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1915465593338013, + "rewards/margins": 0.42795515060424805, + "rewards/rejected": -1.6195017099380493, + "step": 373 + }, + { + "epoch": 0.24426483794595477, + "grad_norm": 12.858984927661352, + "learning_rate": 1.407493621139631e-07, + "logits/chosen": -1.713492751121521, + "logits/rejected": -1.7290925979614258, + "logps/chosen": -641.5909423828125, + "logps/rejected": -650.439697265625, + "loss": 0.5713, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2154250144958496, + "rewards/margins": 0.2901778817176819, + "rewards/rejected": -1.5056030750274658, + "step": 374 + }, + { + "epoch": 0.24491795248591722, + "grad_norm": 15.690657976850302, + "learning_rate": 1.406668673476568e-07, + "logits/chosen": -1.6640945672988892, + "logits/rejected": -1.6718759536743164, + "logps/chosen": -649.737060546875, + "logps/rejected": -686.3532104492188, + "loss": 0.568, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3824304342269897, + "rewards/margins": 0.3402037024497986, + "rewards/rejected": -1.722634196281433, + "step": 375 + }, + { + "epoch": 0.24557106702587966, + "grad_norm": 25.733728918193314, + "learning_rate": 1.4058403077669386e-07, + "logits/chosen": -1.671245813369751, + "logits/rejected": -1.6539918184280396, + "logps/chosen": -620.85791015625, + "logps/rejected": -613.2889404296875, + "loss": 0.5943, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2888745069503784, + "rewards/margins": 0.22225916385650635, + "rewards/rejected": -1.5111336708068848, + "step": 376 + }, + { + "epoch": 0.2462241815658421, + "grad_norm": 9.763117553021695, + "learning_rate": 1.4050085283224946e-07, + "logits/chosen": -1.5626500844955444, + "logits/rejected": -1.57936429977417, + "logps/chosen": -653.4628295898438, + "logps/rejected": -741.0684814453125, + "loss": 0.6487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4431822299957275, + "rewards/margins": 0.3858519494533539, + "rewards/rejected": -1.8290340900421143, + "step": 377 + }, + { + "epoch": 0.24687729610580456, + "grad_norm": 14.715453079588329, + "learning_rate": 1.4041733394727567e-07, + "logits/chosen": -1.6787821054458618, + "logits/rejected": -1.675290584564209, + "logps/chosen": -656.50830078125, + "logps/rejected": -641.9553833007812, + "loss": 0.5851, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2514023780822754, + "rewards/margins": 0.16022822260856628, + "rewards/rejected": -1.411630630493164, + "step": 378 + }, + { + "epoch": 0.247530410645767, + "grad_norm": 17.584914736775964, + "learning_rate": 1.403334745564993e-07, + "logits/chosen": -1.6352498531341553, + "logits/rejected": -1.6839510202407837, + "logps/chosen": -743.545654296875, + "logps/rejected": -792.6776123046875, + "loss": 0.587, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.677843689918518, + "rewards/margins": 0.3219238221645355, + "rewards/rejected": -1.999767541885376, + "step": 379 + }, + { + "epoch": 0.24818352518572945, + "grad_norm": 35.19811670438595, + "learning_rate": 1.4024927509641947e-07, + "logits/chosen": -1.7035257816314697, + "logits/rejected": -1.6846370697021484, + "logps/chosen": -731.2345581054688, + "logps/rejected": -747.091064453125, + "loss": 0.5723, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.520798921585083, + "rewards/margins": 0.359272301197052, + "rewards/rejected": -1.8800714015960693, + "step": 380 + }, + { + "epoch": 0.2488366397256919, + "grad_norm": 51.54152025955344, + "learning_rate": 1.401647360053054e-07, + "logits/chosen": -1.6718703508377075, + "logits/rejected": -1.7071422338485718, + "logps/chosen": -684.9619140625, + "logps/rejected": -685.8736572265625, + "loss": 0.6127, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1913833618164062, + "rewards/margins": 0.4190147817134857, + "rewards/rejected": -1.6103980541229248, + "step": 381 + }, + { + "epoch": 0.24948975426565434, + "grad_norm": 29.30838787108767, + "learning_rate": 1.4007985772319414e-07, + "logits/chosen": -1.58506441116333, + "logits/rejected": -1.576029896736145, + "logps/chosen": -592.5045166015625, + "logps/rejected": -606.4868774414062, + "loss": 0.57, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3507953882217407, + "rewards/margins": 0.4049099385738373, + "rewards/rejected": -1.7557053565979004, + "step": 382 + }, + { + "epoch": 0.25014286880561676, + "grad_norm": 30.296371441246823, + "learning_rate": 1.3999464069188827e-07, + "logits/chosen": -1.511520504951477, + "logits/rejected": -1.5346118211746216, + "logps/chosen": -610.6934814453125, + "logps/rejected": -583.274658203125, + "loss": 0.5848, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2899129390716553, + "rewards/margins": 0.24026429653167725, + "rewards/rejected": -1.5301774740219116, + "step": 383 + }, + { + "epoch": 0.25079598334557923, + "grad_norm": 22.072613902976816, + "learning_rate": 1.3990908535495366e-07, + "logits/chosen": -1.6665383577346802, + "logits/rejected": -1.7260501384735107, + "logps/chosen": -649.5576782226562, + "logps/rejected": -632.3207397460938, + "loss": 0.5902, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3579076528549194, + "rewards/margins": 0.13580122590065002, + "rewards/rejected": -1.493708848953247, + "step": 384 + }, + { + "epoch": 0.25144909788554165, + "grad_norm": 11.159048199299994, + "learning_rate": 1.39823192157717e-07, + "logits/chosen": -1.7021327018737793, + "logits/rejected": -1.7216562032699585, + "logps/chosen": -702.6673583984375, + "logps/rejected": -677.152587890625, + "loss": 0.6105, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3912357091903687, + "rewards/margins": 0.28019946813583374, + "rewards/rejected": -1.6714351177215576, + "step": 385 + }, + { + "epoch": 0.2521022124255041, + "grad_norm": 12.67256793359313, + "learning_rate": 1.3973696154726372e-07, + "logits/chosen": -1.652302622795105, + "logits/rejected": -1.6712439060211182, + "logps/chosen": -615.6753540039062, + "logps/rejected": -612.9768676757812, + "loss": 0.535, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.381615400314331, + "rewards/margins": 0.35143721103668213, + "rewards/rejected": -1.7330526113510132, + "step": 386 + }, + { + "epoch": 0.25275532696546654, + "grad_norm": 9.156751796221311, + "learning_rate": 1.396503939724354e-07, + "logits/chosen": -1.687638521194458, + "logits/rejected": -1.6907660961151123, + "logps/chosen": -679.770263671875, + "logps/rejected": -704.907470703125, + "loss": 0.581, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3277437686920166, + "rewards/margins": 0.3244020938873291, + "rewards/rejected": -1.6521457433700562, + "step": 387 + }, + { + "epoch": 0.253408441505429, + "grad_norm": 26.505065205334525, + "learning_rate": 1.3956348988382757e-07, + "logits/chosen": -1.621063470840454, + "logits/rejected": -1.6041771173477173, + "logps/chosen": -596.8775634765625, + "logps/rejected": -675.8204345703125, + "loss": 0.5536, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4424089193344116, + "rewards/margins": 0.5358285903930664, + "rewards/rejected": -1.9782376289367676, + "step": 388 + }, + { + "epoch": 0.25406155604539143, + "grad_norm": 7.262480595814242, + "learning_rate": 1.394762497337875e-07, + "logits/chosen": -1.6179628372192383, + "logits/rejected": -1.6096928119659424, + "logps/chosen": -653.6219482421875, + "logps/rejected": -699.051513671875, + "loss": 0.6154, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4057137966156006, + "rewards/margins": 0.2138241082429886, + "rewards/rejected": -1.6195377111434937, + "step": 389 + }, + { + "epoch": 0.2547146705853539, + "grad_norm": 17.339973588325055, + "learning_rate": 1.393886739764116e-07, + "logits/chosen": -1.6001747846603394, + "logits/rejected": -1.6313351392745972, + "logps/chosen": -626.1659545898438, + "logps/rejected": -644.4179077148438, + "loss": 0.5469, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3461098670959473, + "rewards/margins": 0.41640228033065796, + "rewards/rejected": -1.76251220703125, + "step": 390 + }, + { + "epoch": 0.2553677851253163, + "grad_norm": 14.339393954095954, + "learning_rate": 1.3930076306754315e-07, + "logits/chosen": -1.5895658731460571, + "logits/rejected": -1.6347410678863525, + "logps/chosen": -611.0591430664062, + "logps/rejected": -595.1077880859375, + "loss": 0.5883, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2730367183685303, + "rewards/margins": 0.2380126416683197, + "rewards/rejected": -1.5110492706298828, + "step": 391 + }, + { + "epoch": 0.2560208996652788, + "grad_norm": 10.18368249263564, + "learning_rate": 1.3921251746476998e-07, + "logits/chosen": -1.7359168529510498, + "logits/rejected": -1.7397726774215698, + "logps/chosen": -588.6852416992188, + "logps/rejected": -697.5007934570312, + "loss": 0.5732, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3043936491012573, + "rewards/margins": 0.6203251481056213, + "rewards/rejected": -1.9247188568115234, + "step": 392 + }, + { + "epoch": 0.2566740142052412, + "grad_norm": 18.38247483748203, + "learning_rate": 1.39123937627422e-07, + "logits/chosen": -1.6276944875717163, + "logits/rejected": -1.5889580249786377, + "logps/chosen": -620.385498046875, + "logps/rejected": -635.3296508789062, + "loss": 0.6444, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3040493726730347, + "rewards/margins": 0.10177471488714218, + "rewards/rejected": -1.4058241844177246, + "step": 393 + }, + { + "epoch": 0.2573271287452037, + "grad_norm": 36.73565911642949, + "learning_rate": 1.390350240165689e-07, + "logits/chosen": -1.6098779439926147, + "logits/rejected": -1.7416176795959473, + "logps/chosen": -642.3756713867188, + "logps/rejected": -643.9348754882812, + "loss": 0.5877, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.407647728919983, + "rewards/margins": 0.2774192690849304, + "rewards/rejected": -1.685067057609558, + "step": 394 + }, + { + "epoch": 0.2579802432851661, + "grad_norm": 16.023490486119584, + "learning_rate": 1.3894577709501766e-07, + "logits/chosen": -1.692483901977539, + "logits/rejected": -1.7297693490982056, + "logps/chosen": -797.02880859375, + "logps/rejected": -657.3602294921875, + "loss": 0.7438, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8514511585235596, + "rewards/margins": -0.006901424378156662, + "rewards/rejected": -1.8445496559143066, + "step": 395 + }, + { + "epoch": 0.2586333578251286, + "grad_norm": 16.55592977846572, + "learning_rate": 1.3885619732731024e-07, + "logits/chosen": -1.6773359775543213, + "logits/rejected": -1.7305378913879395, + "logps/chosen": -627.8666381835938, + "logps/rejected": -695.2501220703125, + "loss": 0.5468, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4438410997390747, + "rewards/margins": 0.509320855140686, + "rewards/rejected": -1.9531618356704712, + "step": 396 + }, + { + "epoch": 0.259286472365091, + "grad_norm": 31.880923432118284, + "learning_rate": 1.3876628517972106e-07, + "logits/chosen": -1.5321552753448486, + "logits/rejected": -1.52942955493927, + "logps/chosen": -627.0950317382812, + "logps/rejected": -674.76025390625, + "loss": 0.5836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.53360915184021, + "rewards/margins": 0.2759949266910553, + "rewards/rejected": -1.8096040487289429, + "step": 397 + }, + { + "epoch": 0.2599395869050535, + "grad_norm": 13.538949029727982, + "learning_rate": 1.3867604112025465e-07, + "logits/chosen": -1.6481291055679321, + "logits/rejected": -1.6133708953857422, + "logps/chosen": -633.3253173828125, + "logps/rejected": -659.7344970703125, + "loss": 0.6281, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4847848415374756, + "rewards/margins": 0.1952800452709198, + "rewards/rejected": -1.6800646781921387, + "step": 398 + }, + { + "epoch": 0.2605927014450159, + "grad_norm": 41.823948195416314, + "learning_rate": 1.3858546561864315e-07, + "logits/chosen": -1.6834887266159058, + "logits/rejected": -1.668868064880371, + "logps/chosen": -664.4490966796875, + "logps/rejected": -718.4891357421875, + "loss": 0.5855, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5809217691421509, + "rewards/margins": 0.4111039936542511, + "rewards/rejected": -1.9920259714126587, + "step": 399 + }, + { + "epoch": 0.26124581598497837, + "grad_norm": 7.562874624386567, + "learning_rate": 1.3849455914634399e-07, + "logits/chosen": -1.6510210037231445, + "logits/rejected": -1.6854655742645264, + "logps/chosen": -694.1316528320312, + "logps/rejected": -653.6522216796875, + "loss": 0.5831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8670495748519897, + "rewards/margins": 0.05141986906528473, + "rewards/rejected": -1.9184695482254028, + "step": 400 + }, + { + "epoch": 0.26124581598497837, + "eval_logits/chosen": -1.6877070665359497, + "eval_logits/rejected": -1.6963342428207397, + "eval_logps/chosen": -661.8920288085938, + "eval_logps/rejected": -689.978759765625, + "eval_loss": 0.5931771397590637, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": -1.515466332435608, + "eval_rewards/margins": 0.36192643642425537, + "eval_rewards/rejected": -1.8773927688598633, + "eval_runtime": 300.4429, + "eval_samples_per_second": 13.314, + "eval_steps_per_second": 0.832, + "step": 400 + }, + { + "epoch": 0.2618989305249408, + "grad_norm": 9.182566410300494, + "learning_rate": 1.3840332217653723e-07, + "logits/chosen": -1.673874020576477, + "logits/rejected": -1.680929183959961, + "logps/chosen": -692.4191284179688, + "logps/rejected": -704.0576171875, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.688931941986084, + "rewards/margins": 0.1834324449300766, + "rewards/rejected": -1.8723644018173218, + "step": 401 + }, + { + "epoch": 0.26255204506490326, + "grad_norm": 8.101445637974606, + "learning_rate": 1.3831175518412327e-07, + "logits/chosen": -1.6620182991027832, + "logits/rejected": -1.6938483715057373, + "logps/chosen": -745.3438720703125, + "logps/rejected": -760.4389038085938, + "loss": 0.6006, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6187502145767212, + "rewards/margins": 0.23492467403411865, + "rewards/rejected": -1.8536747694015503, + "step": 402 + }, + { + "epoch": 0.2632051596048657, + "grad_norm": 10.95532049184201, + "learning_rate": 1.3821985864572028e-07, + "logits/chosen": -1.6001207828521729, + "logits/rejected": -1.565683126449585, + "logps/chosen": -678.5340576171875, + "logps/rejected": -816.8855590820312, + "loss": 0.6223, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7041336297988892, + "rewards/margins": 0.5798905491828918, + "rewards/rejected": -2.284024238586426, + "step": 403 + }, + { + "epoch": 0.26385827414482815, + "grad_norm": 19.544045940040125, + "learning_rate": 1.3812763303966186e-07, + "logits/chosen": -1.6810550689697266, + "logits/rejected": -1.6656105518341064, + "logps/chosen": -635.9982299804688, + "logps/rejected": -654.9970703125, + "loss": 0.5854, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5096485614776611, + "rewards/margins": 0.24476324021816254, + "rewards/rejected": -1.7544115781784058, + "step": 404 + }, + { + "epoch": 0.26451138868479057, + "grad_norm": 8.792026250775443, + "learning_rate": 1.3803507884599438e-07, + "logits/chosen": -1.693314552307129, + "logits/rejected": -1.678006887435913, + "logps/chosen": -794.29443359375, + "logps/rejected": -808.3348388671875, + "loss": 0.5861, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7267074584960938, + "rewards/margins": 0.24482649564743042, + "rewards/rejected": -1.9715338945388794, + "step": 405 + }, + { + "epoch": 0.26516450322475305, + "grad_norm": 39.2544432402473, + "learning_rate": 1.379421965464745e-07, + "logits/chosen": -1.672491431236267, + "logits/rejected": -1.6746456623077393, + "logps/chosen": -739.5106811523438, + "logps/rejected": -728.4205322265625, + "loss": 0.611, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7484608888626099, + "rewards/margins": 0.24220341444015503, + "rewards/rejected": -1.9906642436981201, + "step": 406 + }, + { + "epoch": 0.26581761776471546, + "grad_norm": 61.06487102613164, + "learning_rate": 1.378489866245668e-07, + "logits/chosen": -1.6239569187164307, + "logits/rejected": -1.6054996252059937, + "logps/chosen": -597.54833984375, + "logps/rejected": -692.744384765625, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.441511631011963, + "rewards/margins": 0.5937218070030212, + "rewards/rejected": -2.035233497619629, + "step": 407 + }, + { + "epoch": 0.26647073230467794, + "grad_norm": 9.653358684629563, + "learning_rate": 1.3775544956544115e-07, + "logits/chosen": -1.6666955947875977, + "logits/rejected": -1.6608633995056152, + "logps/chosen": -721.04541015625, + "logps/rejected": -931.44482421875, + "loss": 0.519, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6330653429031372, + "rewards/margins": 1.069937825202942, + "rewards/rejected": -2.7030029296875, + "step": 408 + }, + { + "epoch": 0.26712384684464036, + "grad_norm": 37.20492293139169, + "learning_rate": 1.3766158585597024e-07, + "logits/chosen": -1.747741460800171, + "logits/rejected": -1.681113600730896, + "logps/chosen": -684.744384765625, + "logps/rejected": -716.8374633789062, + "loss": 0.5827, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5082005262374878, + "rewards/margins": 0.3761386573314667, + "rewards/rejected": -1.8843392133712769, + "step": 409 + }, + { + "epoch": 0.26777696138460283, + "grad_norm": 14.02260372554619, + "learning_rate": 1.3756739598472692e-07, + "logits/chosen": -1.6259886026382446, + "logits/rejected": -1.6599675416946411, + "logps/chosen": -740.4105834960938, + "logps/rejected": -777.7703247070312, + "loss": 0.5891, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6748157739639282, + "rewards/margins": 0.48535382747650146, + "rewards/rejected": -2.1601696014404297, + "step": 410 + }, + { + "epoch": 0.26843007592456525, + "grad_norm": 11.140097310760751, + "learning_rate": 1.3747288044198186e-07, + "logits/chosen": -1.692323088645935, + "logits/rejected": -1.705490231513977, + "logps/chosen": -676.2076416015625, + "logps/rejected": -718.6063232421875, + "loss": 0.6076, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5469157695770264, + "rewards/margins": 0.4593140482902527, + "rewards/rejected": -2.006229877471924, + "step": 411 + }, + { + "epoch": 0.2690831904645277, + "grad_norm": 21.86521082932002, + "learning_rate": 1.373780397197009e-07, + "logits/chosen": -1.5999939441680908, + "logits/rejected": -1.638975739479065, + "logps/chosen": -697.4719848632812, + "logps/rejected": -822.9923095703125, + "loss": 0.5562, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7497361898422241, + "rewards/margins": 0.5286187529563904, + "rewards/rejected": -2.278355121612549, + "step": 412 + }, + { + "epoch": 0.26973630500449014, + "grad_norm": 85.11109239770148, + "learning_rate": 1.3728287431154236e-07, + "logits/chosen": -1.65316903591156, + "logits/rejected": -1.6552778482437134, + "logps/chosen": -639.5845947265625, + "logps/rejected": -646.5022583007812, + "loss": 0.6713, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5560939311981201, + "rewards/margins": 0.2231585830450058, + "rewards/rejected": -1.779252529144287, + "step": 413 + }, + { + "epoch": 0.2703894195444526, + "grad_norm": 27.726328984830257, + "learning_rate": 1.371873847128547e-07, + "logits/chosen": -1.7042299509048462, + "logits/rejected": -1.664905309677124, + "logps/chosen": -594.562744140625, + "logps/rejected": -746.6199951171875, + "loss": 0.5604, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5040781497955322, + "rewards/margins": 0.6467331647872925, + "rewards/rejected": -2.1508116722106934, + "step": 414 + }, + { + "epoch": 0.27104253408441503, + "grad_norm": 7.783183095188105, + "learning_rate": 1.3709157142067383e-07, + "logits/chosen": -1.530868649482727, + "logits/rejected": -1.5461376905441284, + "logps/chosen": -656.3353881835938, + "logps/rejected": -642.3773193359375, + "loss": 0.6118, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.472480297088623, + "rewards/margins": 0.1450308859348297, + "rewards/rejected": -1.61751127243042, + "step": 415 + }, + { + "epoch": 0.2716956486243775, + "grad_norm": 19.144666486903013, + "learning_rate": 1.3699543493372047e-07, + "logits/chosen": -1.69258713722229, + "logits/rejected": -1.7538609504699707, + "logps/chosen": -628.3487548828125, + "logps/rejected": -619.4342651367188, + "loss": 0.5694, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.569846510887146, + "rewards/margins": 0.38913729786872864, + "rewards/rejected": -1.9589838981628418, + "step": 416 + }, + { + "epoch": 0.2723487631643399, + "grad_norm": 27.797390348868447, + "learning_rate": 1.3689897575239766e-07, + "logits/chosen": -1.5702205896377563, + "logits/rejected": -1.602936863899231, + "logps/chosen": -667.676513671875, + "logps/rejected": -693.8980712890625, + "loss": 0.5853, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.529921531677246, + "rewards/margins": 0.45492294430732727, + "rewards/rejected": -1.9848445653915405, + "step": 417 + }, + { + "epoch": 0.2730018777043024, + "grad_norm": 8.991724030392819, + "learning_rate": 1.3680219437878805e-07, + "logits/chosen": -1.645481824874878, + "logits/rejected": -1.6687901020050049, + "logps/chosen": -626.5555419921875, + "logps/rejected": -623.034912109375, + "loss": 0.6408, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4581669569015503, + "rewards/margins": 0.075958251953125, + "rewards/rejected": -1.5341250896453857, + "step": 418 + }, + { + "epoch": 0.2736549922442648, + "grad_norm": 34.54942489548843, + "learning_rate": 1.3670509131665145e-07, + "logits/chosen": -1.6880152225494385, + "logits/rejected": -1.7365965843200684, + "logps/chosen": -684.6690673828125, + "logps/rejected": -660.1009521484375, + "loss": 0.6038, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.548675298690796, + "rewards/margins": 0.23877474665641785, + "rewards/rejected": -1.7874499559402466, + "step": 419 + }, + { + "epoch": 0.2743081067842273, + "grad_norm": 7.725802049377969, + "learning_rate": 1.36607667071422e-07, + "logits/chosen": -1.60299551486969, + "logits/rejected": -1.5734202861785889, + "logps/chosen": -566.0638427734375, + "logps/rejected": -681.6033325195312, + "loss": 0.5367, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.342764973640442, + "rewards/margins": 0.6366487741470337, + "rewards/rejected": -1.9794137477874756, + "step": 420 + }, + { + "epoch": 0.2749612213241897, + "grad_norm": 29.01805274332031, + "learning_rate": 1.3650992215020568e-07, + "logits/chosen": -1.6936869621276855, + "logits/rejected": -1.7092525959014893, + "logps/chosen": -627.9664306640625, + "logps/rejected": -691.7342529296875, + "loss": 0.5607, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4970836639404297, + "rewards/margins": 0.4171164631843567, + "rewards/rejected": -1.9142000675201416, + "step": 421 + }, + { + "epoch": 0.2756143358641522, + "grad_norm": 16.315855308400074, + "learning_rate": 1.3641185706177758e-07, + "logits/chosen": -1.6888682842254639, + "logits/rejected": -1.749967336654663, + "logps/chosen": -697.9429321289062, + "logps/rejected": -695.5234985351562, + "loss": 0.5916, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7964341640472412, + "rewards/margins": 0.21294526755809784, + "rewards/rejected": -2.0093793869018555, + "step": 422 + }, + { + "epoch": 0.2762674504041146, + "grad_norm": 14.532459170681967, + "learning_rate": 1.3631347231657941e-07, + "logits/chosen": -1.5714447498321533, + "logits/rejected": -1.5943671464920044, + "logps/chosen": -706.9442138671875, + "logps/rejected": -723.3374633789062, + "loss": 0.5472, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.754028558731079, + "rewards/margins": 0.30723437666893005, + "rewards/rejected": -2.061262845993042, + "step": 423 + }, + { + "epoch": 0.2769205649440771, + "grad_norm": 17.1924352317544, + "learning_rate": 1.3621476842671663e-07, + "logits/chosen": -1.5889177322387695, + "logits/rejected": -1.607007384300232, + "logps/chosen": -727.8897705078125, + "logps/rejected": -668.5709838867188, + "loss": 0.5935, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.8610236644744873, + "rewards/margins": 0.0675252377986908, + "rewards/rejected": -1.92854905128479, + "step": 424 + }, + { + "epoch": 0.2775736794840395, + "grad_norm": 40.939411419121505, + "learning_rate": 1.3611574590595592e-07, + "logits/chosen": -1.6908094882965088, + "logits/rejected": -1.6986981630325317, + "logps/chosen": -739.2138671875, + "logps/rejected": -840.9638061523438, + "loss": 0.5882, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.968212604522705, + "rewards/margins": 0.6422140002250671, + "rewards/rejected": -2.610426425933838, + "step": 425 + }, + { + "epoch": 0.27822679402400197, + "grad_norm": 7.3972481534822405, + "learning_rate": 1.3601640526972256e-07, + "logits/chosen": -1.6408872604370117, + "logits/rejected": -1.6618748903274536, + "logps/chosen": -625.466796875, + "logps/rejected": -732.3475341796875, + "loss": 0.56, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.500174641609192, + "rewards/margins": 0.6626629829406738, + "rewards/rejected": -2.162837505340576, + "step": 426 + }, + { + "epoch": 0.2788799085639644, + "grad_norm": 17.28581442914079, + "learning_rate": 1.3591674703509755e-07, + "logits/chosen": -1.6025550365447998, + "logits/rejected": -1.6443158388137817, + "logps/chosen": -708.13232421875, + "logps/rejected": -776.6422729492188, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7544456720352173, + "rewards/margins": 0.7180143594741821, + "rewards/rejected": -2.4724600315093994, + "step": 427 + }, + { + "epoch": 0.27953302310392686, + "grad_norm": 21.933643031973503, + "learning_rate": 1.3581677172081503e-07, + "logits/chosen": -1.705406904220581, + "logits/rejected": -1.7163548469543457, + "logps/chosen": -776.873291015625, + "logps/rejected": -821.5286865234375, + "loss": 0.5636, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0922696590423584, + "rewards/margins": 0.3925325274467468, + "rewards/rejected": -2.484802484512329, + "step": 428 + }, + { + "epoch": 0.2801861376438893, + "grad_norm": 9.981810314745449, + "learning_rate": 1.3571647984725965e-07, + "logits/chosen": -1.5826154947280884, + "logits/rejected": -1.5522079467773438, + "logps/chosen": -658.821044921875, + "logps/rejected": -708.5677490234375, + "loss": 0.5996, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5901877880096436, + "rewards/margins": 0.4858096241950989, + "rewards/rejected": -2.0759973526000977, + "step": 429 + }, + { + "epoch": 0.28083925218385175, + "grad_norm": 11.36893380136887, + "learning_rate": 1.3561587193646377e-07, + "logits/chosen": -1.5808284282684326, + "logits/rejected": -1.61090087890625, + "logps/chosen": -645.9747314453125, + "logps/rejected": -630.6502685546875, + "loss": 0.6277, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.67696213722229, + "rewards/margins": 0.11575008928775787, + "rewards/rejected": -1.7927122116088867, + "step": 430 + }, + { + "epoch": 0.28149236672381417, + "grad_norm": 65.85082801878511, + "learning_rate": 1.355149485121048e-07, + "logits/chosen": -1.618956446647644, + "logits/rejected": -1.6172845363616943, + "logps/chosen": -570.473876953125, + "logps/rejected": -637.30126953125, + "loss": 0.5954, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5234228372573853, + "rewards/margins": 0.49459823966026306, + "rewards/rejected": -2.0180211067199707, + "step": 431 + }, + { + "epoch": 0.28214548126377664, + "grad_norm": 16.266142416859648, + "learning_rate": 1.3541371009950234e-07, + "logits/chosen": -1.552247166633606, + "logits/rejected": -1.580383539199829, + "logps/chosen": -720.5084228515625, + "logps/rejected": -765.5215454101562, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.807952880859375, + "rewards/margins": 0.48564061522483826, + "rewards/rejected": -2.293593406677246, + "step": 432 + }, + { + "epoch": 0.28279859580373906, + "grad_norm": 42.15718871671079, + "learning_rate": 1.3531215722561562e-07, + "logits/chosen": -1.6510796546936035, + "logits/rejected": -1.6283471584320068, + "logps/chosen": -709.1431884765625, + "logps/rejected": -720.7800903320312, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7743120193481445, + "rewards/margins": 0.24123550951480865, + "rewards/rejected": -2.015547513961792, + "step": 433 + }, + { + "epoch": 0.28345171034370154, + "grad_norm": 19.658520116756474, + "learning_rate": 1.3521029041904067e-07, + "logits/chosen": -1.7421890497207642, + "logits/rejected": -1.7447417974472046, + "logps/chosen": -633.9954833984375, + "logps/rejected": -745.4783325195312, + "loss": 0.5583, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7052303552627563, + "rewards/margins": 0.5718401074409485, + "rewards/rejected": -2.2770705223083496, + "step": 434 + }, + { + "epoch": 0.28410482488366395, + "grad_norm": 61.59792122699335, + "learning_rate": 1.351081102100076e-07, + "logits/chosen": -1.6574376821517944, + "logits/rejected": -1.6252573728561401, + "logps/chosen": -680.404541015625, + "logps/rejected": -763.9954833984375, + "loss": 0.5739, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7275390625, + "rewards/margins": 0.38947176933288574, + "rewards/rejected": -2.117011070251465, + "step": 435 + }, + { + "epoch": 0.2847579394236264, + "grad_norm": 7.980201871732804, + "learning_rate": 1.3500561713037777e-07, + "logits/chosen": -1.6319736242294312, + "logits/rejected": -1.6381350755691528, + "logps/chosen": -719.6119995117188, + "logps/rejected": -745.8162841796875, + "loss": 0.5692, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7299431562423706, + "rewards/margins": 0.5405523180961609, + "rewards/rejected": -2.270495653152466, + "step": 436 + }, + { + "epoch": 0.28541105396358885, + "grad_norm": 8.727533392873836, + "learning_rate": 1.3490281171364112e-07, + "logits/chosen": -1.6387797594070435, + "logits/rejected": -1.6695307493209839, + "logps/chosen": -695.0264892578125, + "logps/rejected": -734.383544921875, + "loss": 0.5652, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7570171356201172, + "rewards/margins": 0.508963942527771, + "rewards/rejected": -2.2659811973571777, + "step": 437 + }, + { + "epoch": 0.2860641685035513, + "grad_norm": 28.528027325398355, + "learning_rate": 1.3479969449491332e-07, + "logits/chosen": -1.600303053855896, + "logits/rejected": -1.5852181911468506, + "logps/chosen": -685.3209838867188, + "logps/rejected": -876.8265380859375, + "loss": 0.5698, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7708958387374878, + "rewards/margins": 0.7778002619743347, + "rewards/rejected": -2.5486960411071777, + "step": 438 + }, + { + "epoch": 0.28671728304351374, + "grad_norm": 14.87235543409777, + "learning_rate": 1.3469626601093301e-07, + "logits/chosen": -1.5701959133148193, + "logits/rejected": -1.6426581144332886, + "logps/chosen": -664.4986572265625, + "logps/rejected": -663.914794921875, + "loss": 0.6061, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8818445205688477, + "rewards/margins": 0.31387796998023987, + "rewards/rejected": -2.1957225799560547, + "step": 439 + }, + { + "epoch": 0.2873703975834762, + "grad_norm": 10.063735379517492, + "learning_rate": 1.34592526800059e-07, + "logits/chosen": -1.6531732082366943, + "logits/rejected": -1.7189072370529175, + "logps/chosen": -714.6017456054688, + "logps/rejected": -692.885498046875, + "loss": 0.5671, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7326029539108276, + "rewards/margins": 0.28911852836608887, + "rewards/rejected": -2.021721363067627, + "step": 440 + }, + { + "epoch": 0.28802351212343863, + "grad_norm": 9.184629537387579, + "learning_rate": 1.3448847740226753e-07, + "logits/chosen": -1.4753351211547852, + "logits/rejected": -1.5424079895019531, + "logps/chosen": -621.2201538085938, + "logps/rejected": -637.651611328125, + "loss": 0.6113, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6852307319641113, + "rewards/margins": 0.3220943510532379, + "rewards/rejected": -2.0073251724243164, + "step": 441 + }, + { + "epoch": 0.2886766266634011, + "grad_norm": 8.636425870814191, + "learning_rate": 1.3438411835914935e-07, + "logits/chosen": -1.689327597618103, + "logits/rejected": -1.6427044868469238, + "logps/chosen": -652.803466796875, + "logps/rejected": -845.2495727539062, + "loss": 0.5666, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6894198656082153, + "rewards/margins": 0.8975628614425659, + "rewards/rejected": -2.5869827270507812, + "step": 442 + }, + { + "epoch": 0.2893297412033635, + "grad_norm": 10.50323658639023, + "learning_rate": 1.3427945021390695e-07, + "logits/chosen": -1.7043615579605103, + "logits/rejected": -1.6875221729278564, + "logps/chosen": -757.1970825195312, + "logps/rejected": -846.8417358398438, + "loss": 0.5605, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8582996129989624, + "rewards/margins": 0.7164192795753479, + "rewards/rejected": -2.574718952178955, + "step": 443 + }, + { + "epoch": 0.289982855743326, + "grad_norm": 58.021997546888514, + "learning_rate": 1.3417447351135174e-07, + "logits/chosen": -1.6309140920639038, + "logits/rejected": -1.5757015943527222, + "logps/chosen": -656.53466796875, + "logps/rejected": -700.3538818359375, + "loss": 0.556, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6346073150634766, + "rewards/margins": 0.3053905665874481, + "rewards/rejected": -1.939997911453247, + "step": 444 + }, + { + "epoch": 0.2906359702832884, + "grad_norm": 23.765137933971218, + "learning_rate": 1.3406918879790125e-07, + "logits/chosen": -1.604915976524353, + "logits/rejected": -1.5979593992233276, + "logps/chosen": -678.1114501953125, + "logps/rejected": -687.6436767578125, + "loss": 0.5872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8041229248046875, + "rewards/margins": 0.2883044183254242, + "rewards/rejected": -2.0924274921417236, + "step": 445 + }, + { + "epoch": 0.2912890848232509, + "grad_norm": 9.50417097205747, + "learning_rate": 1.3396359662157621e-07, + "logits/chosen": -1.625780463218689, + "logits/rejected": -1.6541712284088135, + "logps/chosen": -595.81396484375, + "logps/rejected": -702.814453125, + "loss": 0.544, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.58511221408844, + "rewards/margins": 0.7033818364143372, + "rewards/rejected": -2.288494110107422, + "step": 446 + }, + { + "epoch": 0.2919421993632133, + "grad_norm": 18.981324007656344, + "learning_rate": 1.3385769753199778e-07, + "logits/chosen": -1.6669667959213257, + "logits/rejected": -1.6522274017333984, + "logps/chosen": -686.9908447265625, + "logps/rejected": -650.0782470703125, + "loss": 0.5556, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9364535808563232, + "rewards/margins": 0.2179185450077057, + "rewards/rejected": -2.154372215270996, + "step": 447 + }, + { + "epoch": 0.2925953139031758, + "grad_norm": 10.169059802410512, + "learning_rate": 1.3375149208038454e-07, + "logits/chosen": -1.6317005157470703, + "logits/rejected": -1.6902613639831543, + "logps/chosen": -641.2767944335938, + "logps/rejected": -646.5491943359375, + "loss": 0.5492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7388436794281006, + "rewards/margins": 0.43374961614608765, + "rewards/rejected": -2.172593355178833, + "step": 448 + }, + { + "epoch": 0.2932484284431382, + "grad_norm": 49.39528712371372, + "learning_rate": 1.3364498081954984e-07, + "logits/chosen": -1.6534464359283447, + "logits/rejected": -1.6478941440582275, + "logps/chosen": -652.5145263671875, + "logps/rejected": -719.8404541015625, + "loss": 0.5349, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.6955946683883667, + "rewards/margins": 0.6352224349975586, + "rewards/rejected": -2.3308169841766357, + "step": 449 + }, + { + "epoch": 0.29390154298310067, + "grad_norm": 22.612708126765373, + "learning_rate": 1.3353816430389877e-07, + "logits/chosen": -1.6020948886871338, + "logits/rejected": -1.5905299186706543, + "logps/chosen": -640.2467041015625, + "logps/rejected": -654.7265625, + "loss": 0.5702, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7803701162338257, + "rewards/margins": 0.2993907928466797, + "rewards/rejected": -2.079760789871216, + "step": 450 + }, + { + "epoch": 0.2945546575230631, + "grad_norm": 53.155242423002186, + "learning_rate": 1.3343104308942526e-07, + "logits/chosen": -1.5806388854980469, + "logits/rejected": -1.6192755699157715, + "logps/chosen": -638.1632080078125, + "logps/rejected": -661.924072265625, + "loss": 0.5966, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5359299182891846, + "rewards/margins": 0.4746638536453247, + "rewards/rejected": -2.0105934143066406, + "step": 451 + }, + { + "epoch": 0.29520777206302556, + "grad_norm": 27.43879161964594, + "learning_rate": 1.3332361773370933e-07, + "logits/chosen": -1.615262746810913, + "logits/rejected": -1.618033766746521, + "logps/chosen": -684.5283203125, + "logps/rejected": -777.6100463867188, + "loss": 0.5537, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.678364634513855, + "rewards/margins": 0.7497380971908569, + "rewards/rejected": -2.428102731704712, + "step": 452 + }, + { + "epoch": 0.295860886602988, + "grad_norm": 90.50268890044883, + "learning_rate": 1.3321588879591404e-07, + "logits/chosen": -1.6059372425079346, + "logits/rejected": -1.646728754043579, + "logps/chosen": -712.547119140625, + "logps/rejected": -678.2857666015625, + "loss": 0.6262, + "rewards/accuracies": 0.46875, + "rewards/chosen": -2.0923614501953125, + "rewards/margins": 0.0826776921749115, + "rewards/rejected": -2.175039291381836, + "step": 453 + }, + { + "epoch": 0.29651400114295046, + "grad_norm": 23.544890284443007, + "learning_rate": 1.331078568367826e-07, + "logits/chosen": -1.6115236282348633, + "logits/rejected": -1.6034373044967651, + "logps/chosen": -676.42529296875, + "logps/rejected": -732.385009765625, + "loss": 0.5661, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7986388206481934, + "rewards/margins": 0.3738027811050415, + "rewards/rejected": -2.1724417209625244, + "step": 454 + }, + { + "epoch": 0.2971671156829129, + "grad_norm": 20.85817286901651, + "learning_rate": 1.3299952241863558e-07, + "logits/chosen": -1.547767996788025, + "logits/rejected": -1.5575647354125977, + "logps/chosen": -677.2283325195312, + "logps/rejected": -753.81201171875, + "loss": 0.5791, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8230202198028564, + "rewards/margins": 0.49635809659957886, + "rewards/rejected": -2.319378137588501, + "step": 455 + }, + { + "epoch": 0.29782023022287535, + "grad_norm": 83.24183783338488, + "learning_rate": 1.3289088610536775e-07, + "logits/chosen": -1.704686164855957, + "logits/rejected": -1.7018077373504639, + "logps/chosen": -748.0383911132812, + "logps/rejected": -787.422607421875, + "loss": 0.5771, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.033522605895996, + "rewards/margins": 0.40751010179519653, + "rewards/rejected": -2.441032886505127, + "step": 456 + }, + { + "epoch": 0.29847334476283777, + "grad_norm": 9.918684990634835, + "learning_rate": 1.3278194846244547e-07, + "logits/chosen": -1.6032931804656982, + "logits/rejected": -1.631669282913208, + "logps/chosen": -650.1741943359375, + "logps/rejected": -619.4449462890625, + "loss": 0.5868, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0186896324157715, + "rewards/margins": 0.09990565478801727, + "rewards/rejected": -2.1185953617095947, + "step": 457 + }, + { + "epoch": 0.29912645930280024, + "grad_norm": 8.930483236743907, + "learning_rate": 1.326727100569034e-07, + "logits/chosen": -1.5541692972183228, + "logits/rejected": -1.5824774503707886, + "logps/chosen": -660.3080444335938, + "logps/rejected": -716.2559814453125, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9189565181732178, + "rewards/margins": 0.3501700758934021, + "rewards/rejected": -2.2691266536712646, + "step": 458 + }, + { + "epoch": 0.29977957384276266, + "grad_norm": 27.815139403173614, + "learning_rate": 1.3256317145734176e-07, + "logits/chosen": -1.6188397407531738, + "logits/rejected": -1.6419901847839355, + "logps/chosen": -769.3443603515625, + "logps/rejected": -774.8818969726562, + "loss": 0.5426, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.362337350845337, + "rewards/margins": 0.2768968641757965, + "rewards/rejected": -2.6392343044281006, + "step": 459 + }, + { + "epoch": 0.30043268838272513, + "grad_norm": 32.322236847979745, + "learning_rate": 1.3245333323392333e-07, + "logits/chosen": -1.575210452079773, + "logits/rejected": -1.5853873491287231, + "logps/chosen": -663.4458618164062, + "logps/rejected": -697.9381103515625, + "loss": 0.5825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7677443027496338, + "rewards/margins": 0.5210894346237183, + "rewards/rejected": -2.2888338565826416, + "step": 460 + }, + { + "epoch": 0.30108580292268755, + "grad_norm": 39.148266264726594, + "learning_rate": 1.3234319595837053e-07, + "logits/chosen": -1.6241929531097412, + "logits/rejected": -1.633462905883789, + "logps/chosen": -698.6934814453125, + "logps/rejected": -783.1587524414062, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.971913456916809, + "rewards/margins": 0.39414018392562866, + "rewards/rejected": -2.366053581237793, + "step": 461 + }, + { + "epoch": 0.30173891746265, + "grad_norm": 11.953653868367251, + "learning_rate": 1.3223276020396224e-07, + "logits/chosen": -1.6563340425491333, + "logits/rejected": -1.6382731199264526, + "logps/chosen": -750.8363647460938, + "logps/rejected": -918.314453125, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.174654722213745, + "rewards/margins": 0.9784154891967773, + "rewards/rejected": -3.1530702114105225, + "step": 462 + }, + { + "epoch": 0.30239203200261244, + "grad_norm": 11.156052343735718, + "learning_rate": 1.3212202654553108e-07, + "logits/chosen": -1.6498849391937256, + "logits/rejected": -1.611333966255188, + "logps/chosen": -697.7623291015625, + "logps/rejected": -808.7626953125, + "loss": 0.5996, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0645065307617188, + "rewards/margins": 0.5361623167991638, + "rewards/rejected": -2.6006689071655273, + "step": 463 + }, + { + "epoch": 0.3030451465425749, + "grad_norm": 9.56202472673271, + "learning_rate": 1.3201099555946027e-07, + "logits/chosen": -1.5961620807647705, + "logits/rejected": -1.6157381534576416, + "logps/chosen": -695.4110717773438, + "logps/rejected": -687.124755859375, + "loss": 0.5877, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.75066339969635, + "rewards/margins": 0.38458389043807983, + "rewards/rejected": -2.135247230529785, + "step": 464 + }, + { + "epoch": 0.30369826108253734, + "grad_norm": 16.37722781419385, + "learning_rate": 1.3189966782368067e-07, + "logits/chosen": -1.478019118309021, + "logits/rejected": -1.5088088512420654, + "logps/chosen": -654.6697998046875, + "logps/rejected": -701.0975952148438, + "loss": 0.5274, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9592812061309814, + "rewards/margins": 0.3919811546802521, + "rewards/rejected": -2.351262331008911, + "step": 465 + }, + { + "epoch": 0.3043513756224998, + "grad_norm": 48.09748648700373, + "learning_rate": 1.3178804391766773e-07, + "logits/chosen": -1.6397638320922852, + "logits/rejected": -1.6730685234069824, + "logps/chosen": -689.5550537109375, + "logps/rejected": -693.6464233398438, + "loss": 0.5573, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6381131410598755, + "rewards/margins": 0.5276268720626831, + "rewards/rejected": -2.1657400131225586, + "step": 466 + }, + { + "epoch": 0.3050044901624622, + "grad_norm": 38.21088887373832, + "learning_rate": 1.3167612442243849e-07, + "logits/chosen": -1.6109600067138672, + "logits/rejected": -1.588073968887329, + "logps/chosen": -679.6129760742188, + "logps/rejected": -752.5172119140625, + "loss": 0.5441, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.768452525138855, + "rewards/margins": 0.5461674332618713, + "rewards/rejected": -2.314620018005371, + "step": 467 + }, + { + "epoch": 0.3056576047024247, + "grad_norm": 33.132628684957666, + "learning_rate": 1.3156390992054862e-07, + "logits/chosen": -1.6505910158157349, + "logits/rejected": -1.6523258686065674, + "logps/chosen": -707.0516357421875, + "logps/rejected": -742.3071899414062, + "loss": 0.5618, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.175845146179199, + "rewards/margins": 0.5027600526809692, + "rewards/rejected": -2.678605318069458, + "step": 468 + }, + { + "epoch": 0.3063107192423871, + "grad_norm": 13.439285792051546, + "learning_rate": 1.3145140099608933e-07, + "logits/chosen": -1.6851032972335815, + "logits/rejected": -1.6651477813720703, + "logps/chosen": -708.4338989257812, + "logps/rejected": -813.6752319335938, + "loss": 0.5774, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.052492380142212, + "rewards/margins": 0.5105604529380798, + "rewards/rejected": -2.5630528926849365, + "step": 469 + }, + { + "epoch": 0.3069638337823496, + "grad_norm": 21.620672217002696, + "learning_rate": 1.3133859823468433e-07, + "logits/chosen": -1.5591509342193604, + "logits/rejected": -1.620843768119812, + "logps/chosen": -712.449951171875, + "logps/rejected": -747.5558471679688, + "loss": 0.561, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.18332576751709, + "rewards/margins": 0.5399474501609802, + "rewards/rejected": -2.7232730388641357, + "step": 470 + }, + { + "epoch": 0.307616948322312, + "grad_norm": 7.80149433228074, + "learning_rate": 1.3122550222348676e-07, + "logits/chosen": -1.6145565509796143, + "logits/rejected": -1.612226128578186, + "logps/chosen": -602.789794921875, + "logps/rejected": -650.0497436523438, + "loss": 0.5333, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8238171339035034, + "rewards/margins": 0.18020805716514587, + "rewards/rejected": -2.0040249824523926, + "step": 471 + }, + { + "epoch": 0.3082700628622745, + "grad_norm": 12.541386961248598, + "learning_rate": 1.3111211355117625e-07, + "logits/chosen": -1.7079181671142578, + "logits/rejected": -1.6550383567810059, + "logps/chosen": -733.6851806640625, + "logps/rejected": -804.6192626953125, + "loss": 0.5624, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.264955520629883, + "rewards/margins": 0.5682797431945801, + "rewards/rejected": -2.833235263824463, + "step": 472 + }, + { + "epoch": 0.3089231774022369, + "grad_norm": 49.77231044522242, + "learning_rate": 1.3099843280795564e-07, + "logits/chosen": -1.5972816944122314, + "logits/rejected": -1.5726184844970703, + "logps/chosen": -697.912109375, + "logps/rejected": -767.2205810546875, + "loss": 0.5352, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8209738731384277, + "rewards/margins": 0.34553292393684387, + "rewards/rejected": -2.16650652885437, + "step": 473 + }, + { + "epoch": 0.3095762919421994, + "grad_norm": 9.964229839524728, + "learning_rate": 1.3088446058554813e-07, + "logits/chosen": -1.5458612442016602, + "logits/rejected": -1.533179759979248, + "logps/chosen": -707.8756103515625, + "logps/rejected": -787.054443359375, + "loss": 0.5863, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.043527364730835, + "rewards/margins": 0.6540895700454712, + "rewards/rejected": -2.6976165771484375, + "step": 474 + }, + { + "epoch": 0.3102294064821618, + "grad_norm": 45.959503738015734, + "learning_rate": 1.3077019747719412e-07, + "logits/chosen": -1.595198154449463, + "logits/rejected": -1.591080904006958, + "logps/chosen": -645.2706298828125, + "logps/rejected": -711.2510986328125, + "loss": 0.5782, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0279712677001953, + "rewards/margins": 0.44245079159736633, + "rewards/rejected": -2.4704222679138184, + "step": 475 + }, + { + "epoch": 0.31088252102212427, + "grad_norm": 44.73324911306505, + "learning_rate": 1.3065564407764802e-07, + "logits/chosen": -1.7171953916549683, + "logits/rejected": -1.707771897315979, + "logps/chosen": -780.519775390625, + "logps/rejected": -787.8773193359375, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1023976802825928, + "rewards/margins": 0.306552529335022, + "rewards/rejected": -2.4089503288269043, + "step": 476 + }, + { + "epoch": 0.3115356355620867, + "grad_norm": 48.62571818947248, + "learning_rate": 1.3054080098317535e-07, + "logits/chosen": -1.6242291927337646, + "logits/rejected": -1.604690432548523, + "logps/chosen": -631.8426513671875, + "logps/rejected": -682.6934204101562, + "loss": 0.5734, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8018213510513306, + "rewards/margins": 0.6165929436683655, + "rewards/rejected": -2.418414354324341, + "step": 477 + }, + { + "epoch": 0.31218875010204916, + "grad_norm": 12.529349998561317, + "learning_rate": 1.3042566879154942e-07, + "logits/chosen": -1.6056309938430786, + "logits/rejected": -1.5826270580291748, + "logps/chosen": -729.5185546875, + "logps/rejected": -871.41064453125, + "loss": 0.5912, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2389609813690186, + "rewards/margins": 0.8720102310180664, + "rewards/rejected": -3.110971450805664, + "step": 478 + }, + { + "epoch": 0.3128418646420116, + "grad_norm": 25.761326367725754, + "learning_rate": 1.3031024810204844e-07, + "logits/chosen": -1.652268648147583, + "logits/rejected": -1.6630209684371948, + "logps/chosen": -703.5462036132812, + "logps/rejected": -724.381591796875, + "loss": 0.5784, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.003835439682007, + "rewards/margins": 0.37049388885498047, + "rewards/rejected": -2.3743293285369873, + "step": 479 + }, + { + "epoch": 0.31349497918197405, + "grad_norm": 33.561886992169335, + "learning_rate": 1.3019453951545222e-07, + "logits/chosen": -1.5987491607666016, + "logits/rejected": -1.5921955108642578, + "logps/chosen": -745.3104248046875, + "logps/rejected": -764.4356689453125, + "loss": 0.5442, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0976104736328125, + "rewards/margins": 0.27280157804489136, + "rewards/rejected": -2.3704121112823486, + "step": 480 + }, + { + "epoch": 0.31414809372193647, + "grad_norm": 36.917374177404064, + "learning_rate": 1.3007854363403912e-07, + "logits/chosen": -1.4915146827697754, + "logits/rejected": -1.4882618188858032, + "logps/chosen": -751.360107421875, + "logps/rejected": -766.1044311523438, + "loss": 0.5453, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2802181243896484, + "rewards/margins": 0.4815993905067444, + "rewards/rejected": -2.761817693710327, + "step": 481 + }, + { + "epoch": 0.31480120826189895, + "grad_norm": 13.711790140200893, + "learning_rate": 1.2996226106158292e-07, + "logits/chosen": -1.5975637435913086, + "logits/rejected": -1.6035072803497314, + "logps/chosen": -755.90576171875, + "logps/rejected": -734.3273315429688, + "loss": 0.5692, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2815115451812744, + "rewards/margins": 0.35683876276016235, + "rewards/rejected": -2.638350486755371, + "step": 482 + }, + { + "epoch": 0.31545432280186136, + "grad_norm": 10.225083803183653, + "learning_rate": 1.2984569240334968e-07, + "logits/chosen": -1.6066551208496094, + "logits/rejected": -1.6023541688919067, + "logps/chosen": -683.2215576171875, + "logps/rejected": -708.5989379882812, + "loss": 0.643, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3892295360565186, + "rewards/margins": 0.2522014081478119, + "rewards/rejected": -2.6414308547973633, + "step": 483 + }, + { + "epoch": 0.31610743734182384, + "grad_norm": 43.21606295702285, + "learning_rate": 1.297288382660945e-07, + "logits/chosen": -1.5631223917007446, + "logits/rejected": -1.5718114376068115, + "logps/chosen": -689.9915771484375, + "logps/rejected": -718.431396484375, + "loss": 0.6106, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1257877349853516, + "rewards/margins": 0.2984415590763092, + "rewards/rejected": -2.424229145050049, + "step": 484 + }, + { + "epoch": 0.31676055188178626, + "grad_norm": 27.88872175196333, + "learning_rate": 1.2961169925805854e-07, + "logits/chosen": -1.5951167345046997, + "logits/rejected": -1.6255998611450195, + "logps/chosen": -788.0588989257812, + "logps/rejected": -845.193603515625, + "loss": 0.5597, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4089677333831787, + "rewards/margins": 0.6144492626190186, + "rewards/rejected": -3.0234172344207764, + "step": 485 + }, + { + "epoch": 0.31741366642174873, + "grad_norm": 20.90987797781216, + "learning_rate": 1.294942759889657e-07, + "logits/chosen": -1.675968050956726, + "logits/rejected": -1.7186676263809204, + "logps/chosen": -748.28466796875, + "logps/rejected": -773.9339599609375, + "loss": 0.5613, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.358844757080078, + "rewards/margins": 0.4122333824634552, + "rewards/rejected": -2.771078109741211, + "step": 486 + }, + { + "epoch": 0.31806678096171115, + "grad_norm": 55.17649113640136, + "learning_rate": 1.2937656907001946e-07, + "logits/chosen": -1.641244649887085, + "logits/rejected": -1.6599280834197998, + "logps/chosen": -761.1217041015625, + "logps/rejected": -830.0103149414062, + "loss": 0.5631, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4944777488708496, + "rewards/margins": 0.6374845504760742, + "rewards/rejected": -3.131962299346924, + "step": 487 + }, + { + "epoch": 0.3187198955016736, + "grad_norm": 102.83841154795559, + "learning_rate": 1.2925857911389977e-07, + "logits/chosen": -1.5266590118408203, + "logits/rejected": -1.5372052192687988, + "logps/chosen": -713.9895629882812, + "logps/rejected": -758.6173706054688, + "loss": 0.5926, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.067730188369751, + "rewards/margins": 0.3173166811466217, + "rewards/rejected": -2.3850467205047607, + "step": 488 + }, + { + "epoch": 0.31937301004163604, + "grad_norm": 16.29290504478057, + "learning_rate": 1.2914030673475987e-07, + "logits/chosen": -1.6052075624465942, + "logits/rejected": -1.6125444173812866, + "logps/chosen": -788.452392578125, + "logps/rejected": -915.9051513671875, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4514353275299072, + "rewards/margins": 0.7415469884872437, + "rewards/rejected": -3.1929819583892822, + "step": 489 + }, + { + "epoch": 0.3200261245815985, + "grad_norm": 10.250010376773696, + "learning_rate": 1.29021752548223e-07, + "logits/chosen": -1.5654120445251465, + "logits/rejected": -1.5245580673217773, + "logps/chosen": -656.7188720703125, + "logps/rejected": -755.3834838867188, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9858925342559814, + "rewards/margins": 0.6740937829017639, + "rewards/rejected": -2.6599864959716797, + "step": 490 + }, + { + "epoch": 0.32067923912156093, + "grad_norm": 26.08722651165643, + "learning_rate": 1.2890291717137919e-07, + "logits/chosen": -1.6398707628250122, + "logits/rejected": -1.6489192247390747, + "logps/chosen": -740.1271362304688, + "logps/rejected": -744.15576171875, + "loss": 0.5919, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0526180267333984, + "rewards/margins": 0.4571646749973297, + "rewards/rejected": -2.509782552719116, + "step": 491 + }, + { + "epoch": 0.3213323536615234, + "grad_norm": 11.326643733648055, + "learning_rate": 1.287838012227822e-07, + "logits/chosen": -1.4910850524902344, + "logits/rejected": -1.5168672800064087, + "logps/chosen": -666.2679443359375, + "logps/rejected": -719.11328125, + "loss": 0.5579, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.113301992416382, + "rewards/margins": 0.3083459138870239, + "rewards/rejected": -2.4216480255126953, + "step": 492 + }, + { + "epoch": 0.3219854682014858, + "grad_norm": 10.301719458410284, + "learning_rate": 1.2866440532244618e-07, + "logits/chosen": -1.6052420139312744, + "logits/rejected": -1.5893734693527222, + "logps/chosen": -680.7243041992188, + "logps/rejected": -705.3173828125, + "loss": 0.5927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.155557632446289, + "rewards/margins": 0.13128241896629333, + "rewards/rejected": -2.2868399620056152, + "step": 493 + }, + { + "epoch": 0.3226385827414483, + "grad_norm": 23.898710768355055, + "learning_rate": 1.2854473009184242e-07, + "logits/chosen": -1.582448959350586, + "logits/rejected": -1.6095366477966309, + "logps/chosen": -674.9298095703125, + "logps/rejected": -756.9193725585938, + "loss": 0.5823, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0278987884521484, + "rewards/margins": 0.5040706992149353, + "rewards/rejected": -2.5319693088531494, + "step": 494 + }, + { + "epoch": 0.3232916972814107, + "grad_norm": 38.15944101714436, + "learning_rate": 1.2842477615389622e-07, + "logits/chosen": -1.582186222076416, + "logits/rejected": -1.589294195175171, + "logps/chosen": -693.418701171875, + "logps/rejected": -733.3966064453125, + "loss": 0.5397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.0407207012176514, + "rewards/margins": 0.5694603323936462, + "rewards/rejected": -2.6101813316345215, + "step": 495 + }, + { + "epoch": 0.3239448118213732, + "grad_norm": 14.734546679223891, + "learning_rate": 1.2830454413298353e-07, + "logits/chosen": -1.6934702396392822, + "logits/rejected": -1.692880630493164, + "logps/chosen": -768.6975708007812, + "logps/rejected": -769.83984375, + "loss": 0.5351, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1913037300109863, + "rewards/margins": 0.30963531136512756, + "rewards/rejected": -2.500938892364502, + "step": 496 + }, + { + "epoch": 0.3245979263613356, + "grad_norm": 57.674097527721926, + "learning_rate": 1.2818403465492783e-07, + "logits/chosen": -1.5970431566238403, + "logits/rejected": -1.6256811618804932, + "logps/chosen": -704.041748046875, + "logps/rejected": -755.9532470703125, + "loss": 0.5673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2154181003570557, + "rewards/margins": 0.48861750960350037, + "rewards/rejected": -2.704035520553589, + "step": 497 + }, + { + "epoch": 0.3252510409012981, + "grad_norm": 51.51237394300094, + "learning_rate": 1.280632483469967e-07, + "logits/chosen": -1.601000189781189, + "logits/rejected": -1.615774393081665, + "logps/chosen": -765.2921142578125, + "logps/rejected": -783.3229370117188, + "loss": 0.5797, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.198168992996216, + "rewards/margins": 0.4726821184158325, + "rewards/rejected": -2.670850992202759, + "step": 498 + }, + { + "epoch": 0.3259041554412605, + "grad_norm": 52.55743751092252, + "learning_rate": 1.2794218583789876e-07, + "logits/chosen": -1.6243011951446533, + "logits/rejected": -1.6285080909729004, + "logps/chosen": -646.337890625, + "logps/rejected": -663.8938598632812, + "loss": 0.5854, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0927515029907227, + "rewards/margins": 0.4221267104148865, + "rewards/rejected": -2.514878273010254, + "step": 499 + }, + { + "epoch": 0.326557269981223, + "grad_norm": 40.80464561018148, + "learning_rate": 1.278208477577802e-07, + "logits/chosen": -1.6202638149261475, + "logits/rejected": -1.6449174880981445, + "logps/chosen": -826.0245361328125, + "logps/rejected": -890.9912109375, + "loss": 0.5447, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2344555854797363, + "rewards/margins": 0.7165102958679199, + "rewards/rejected": -2.9509658813476562, + "step": 500 + }, + { + "epoch": 0.326557269981223, + "eval_logits/chosen": -1.6206858158111572, + "eval_logits/rejected": -1.6248681545257568, + "eval_logps/chosen": -728.922119140625, + "eval_logps/rejected": -772.7636108398438, + "eval_loss": 0.5644757151603699, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": -2.1857683658599854, + "eval_rewards/margins": 0.5194733142852783, + "eval_rewards/rejected": -2.7052416801452637, + "eval_runtime": 296.3263, + "eval_samples_per_second": 13.499, + "eval_steps_per_second": 0.844, + "step": 500 + }, + { + "epoch": 0.3272103845211854, + "grad_norm": 18.6916196942294, + "learning_rate": 1.276992347382217e-07, + "logits/chosen": -1.6296683549880981, + "logits/rejected": -1.6314842700958252, + "logps/chosen": -768.372314453125, + "logps/rejected": -827.50634765625, + "loss": 0.5824, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.622889280319214, + "rewards/margins": 0.44725632667541504, + "rewards/rejected": -3.070145606994629, + "step": 501 + }, + { + "epoch": 0.32786349906114787, + "grad_norm": 38.99333241282542, + "learning_rate": 1.2757734741223494e-07, + "logits/chosen": -1.598349690437317, + "logits/rejected": -1.59029221534729, + "logps/chosen": -643.4448852539062, + "logps/rejected": -724.112548828125, + "loss": 0.584, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8339576721191406, + "rewards/margins": 0.5225554704666138, + "rewards/rejected": -2.356513023376465, + "step": 502 + }, + { + "epoch": 0.3285166136011103, + "grad_norm": 57.964045081569594, + "learning_rate": 1.2745518641425945e-07, + "logits/chosen": -1.5579354763031006, + "logits/rejected": -1.5548033714294434, + "logps/chosen": -681.3307495117188, + "logps/rejected": -820.7464599609375, + "loss": 0.559, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9576513767242432, + "rewards/margins": 0.9468222856521606, + "rewards/rejected": -2.9044735431671143, + "step": 503 + }, + { + "epoch": 0.32916972814107276, + "grad_norm": 26.127719689786453, + "learning_rate": 1.2733275238015923e-07, + "logits/chosen": -1.6055808067321777, + "logits/rejected": -1.630063772201538, + "logps/chosen": -629.9222412109375, + "logps/rejected": -704.1810302734375, + "loss": 0.5861, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0422046184539795, + "rewards/margins": 0.5639029741287231, + "rewards/rejected": -2.606107234954834, + "step": 504 + }, + { + "epoch": 0.3298228426810352, + "grad_norm": 26.745925680747586, + "learning_rate": 1.272100459472195e-07, + "logits/chosen": -1.4966247081756592, + "logits/rejected": -1.51353120803833, + "logps/chosen": -733.5635986328125, + "logps/rejected": -757.8365478515625, + "loss": 0.6147, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2747292518615723, + "rewards/margins": 0.3070646822452545, + "rewards/rejected": -2.581793785095215, + "step": 505 + }, + { + "epoch": 0.33047595722099765, + "grad_norm": 15.726026277688065, + "learning_rate": 1.2708706775414333e-07, + "logits/chosen": -1.562856912612915, + "logits/rejected": -1.5949100255966187, + "logps/chosen": -742.255859375, + "logps/rejected": -773.4927978515625, + "loss": 0.5832, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.044926643371582, + "rewards/margins": 0.5169533491134644, + "rewards/rejected": -2.561879873275757, + "step": 506 + }, + { + "epoch": 0.33112907176096007, + "grad_norm": 18.237745759610128, + "learning_rate": 1.269638184410483e-07, + "logits/chosen": -1.6007072925567627, + "logits/rejected": -1.627900242805481, + "logps/chosen": -760.51708984375, + "logps/rejected": -827.0796508789062, + "loss": 0.5619, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.159515619277954, + "rewards/margins": 0.579525887966156, + "rewards/rejected": -2.739041566848755, + "step": 507 + }, + { + "epoch": 0.33178218630092254, + "grad_norm": 10.09970310506052, + "learning_rate": 1.2684029864946334e-07, + "logits/chosen": -1.4610607624053955, + "logits/rejected": -1.4745755195617676, + "logps/chosen": -622.4526977539062, + "logps/rejected": -639.34814453125, + "loss": 0.5861, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.053276538848877, + "rewards/margins": 0.30753716826438904, + "rewards/rejected": -2.360813617706299, + "step": 508 + }, + { + "epoch": 0.33243530084088496, + "grad_norm": 29.561202029180375, + "learning_rate": 1.2671650902232512e-07, + "logits/chosen": -1.5951420068740845, + "logits/rejected": -1.6463465690612793, + "logps/chosen": -701.150634765625, + "logps/rejected": -748.044921875, + "loss": 0.5395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1120245456695557, + "rewards/margins": 0.4306832551956177, + "rewards/rejected": -2.542707681655884, + "step": 509 + }, + { + "epoch": 0.33308841538084744, + "grad_norm": 20.140045301365248, + "learning_rate": 1.2659245020397487e-07, + "logits/chosen": -1.6009819507598877, + "logits/rejected": -1.6235804557800293, + "logps/chosen": -767.361083984375, + "logps/rejected": -796.4589233398438, + "loss": 0.6337, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.34623646736145, + "rewards/margins": 0.62890625, + "rewards/rejected": -2.975142478942871, + "step": 510 + }, + { + "epoch": 0.33374152992080985, + "grad_norm": 20.816657867687876, + "learning_rate": 1.2646812284015502e-07, + "logits/chosen": -1.5827473402023315, + "logits/rejected": -1.5781760215759277, + "logps/chosen": -730.8731079101562, + "logps/rejected": -745.44775390625, + "loss": 0.5493, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1604835987091064, + "rewards/margins": 0.3516397774219513, + "rewards/rejected": -2.5121235847473145, + "step": 511 + }, + { + "epoch": 0.33439464446077233, + "grad_norm": 27.37026270313324, + "learning_rate": 1.263435275780058e-07, + "logits/chosen": -1.5628294944763184, + "logits/rejected": -1.5843901634216309, + "logps/chosen": -666.028076171875, + "logps/rejected": -709.0813598632812, + "loss": 0.646, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.901466965675354, + "rewards/margins": 0.44154587388038635, + "rewards/rejected": -2.343013048171997, + "step": 512 + }, + { + "epoch": 0.33504775900073475, + "grad_norm": 35.16457922567049, + "learning_rate": 1.262186650660619e-07, + "logits/chosen": -1.5470932722091675, + "logits/rejected": -1.5630905628204346, + "logps/chosen": -718.0603637695312, + "logps/rejected": -732.3851928710938, + "loss": 0.5527, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.389463424682617, + "rewards/margins": 0.3482608497142792, + "rewards/rejected": -2.7377243041992188, + "step": 513 + }, + { + "epoch": 0.3357008735406972, + "grad_norm": 70.3813885396455, + "learning_rate": 1.2609353595424905e-07, + "logits/chosen": -1.5562270879745483, + "logits/rejected": -1.5387725830078125, + "logps/chosen": -601.1290893554688, + "logps/rejected": -723.7789916992188, + "loss": 0.5382, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8300285339355469, + "rewards/margins": 0.7093777060508728, + "rewards/rejected": -2.5394062995910645, + "step": 514 + }, + { + "epoch": 0.33635398808065964, + "grad_norm": 40.988995422153955, + "learning_rate": 1.2596814089388074e-07, + "logits/chosen": -1.646407127380371, + "logits/rejected": -1.622480869293213, + "logps/chosen": -724.177001953125, + "logps/rejected": -750.5733032226562, + "loss": 0.5101, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.0493416786193848, + "rewards/margins": 0.4716246724128723, + "rewards/rejected": -2.520966053009033, + "step": 515 + }, + { + "epoch": 0.3370071026206221, + "grad_norm": 22.872898048232333, + "learning_rate": 1.2584248053765463e-07, + "logits/chosen": -1.668154239654541, + "logits/rejected": -1.6877217292785645, + "logps/chosen": -788.1798095703125, + "logps/rejected": -882.4114990234375, + "loss": 0.5467, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5617947578430176, + "rewards/margins": 0.7210710048675537, + "rewards/rejected": -3.2828657627105713, + "step": 516 + }, + { + "epoch": 0.33766021716058453, + "grad_norm": 62.010832881435796, + "learning_rate": 1.257165555396494e-07, + "logits/chosen": -1.6311254501342773, + "logits/rejected": -1.603393793106079, + "logps/chosen": -683.9799194335938, + "logps/rejected": -818.5030517578125, + "loss": 0.5689, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8681172132492065, + "rewards/margins": 0.7638627290725708, + "rewards/rejected": -2.6319799423217773, + "step": 517 + }, + { + "epoch": 0.338313331700547, + "grad_norm": 47.93407261910419, + "learning_rate": 1.2559036655532116e-07, + "logits/chosen": -1.639786958694458, + "logits/rejected": -1.6281380653381348, + "logps/chosen": -655.11083984375, + "logps/rejected": -711.305419921875, + "loss": 0.6042, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.9371042251586914, + "rewards/margins": 0.29598766565322876, + "rewards/rejected": -2.2330920696258545, + "step": 518 + }, + { + "epoch": 0.3389664462405094, + "grad_norm": 8.481962150515601, + "learning_rate": 1.2546391424150015e-07, + "logits/chosen": -1.5316431522369385, + "logits/rejected": -1.5820224285125732, + "logps/chosen": -723.28173828125, + "logps/rejected": -765.1573486328125, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.011814594268799, + "rewards/margins": 0.4189469814300537, + "rewards/rejected": -2.4307615756988525, + "step": 519 + }, + { + "epoch": 0.3396195607804719, + "grad_norm": 16.110076519841456, + "learning_rate": 1.2533719925638722e-07, + "logits/chosen": -1.561875820159912, + "logits/rejected": -1.563469409942627, + "logps/chosen": -703.0348510742188, + "logps/rejected": -769.7527465820312, + "loss": 0.5457, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9231442213058472, + "rewards/margins": 0.762036144733429, + "rewards/rejected": -2.685180187225342, + "step": 520 + }, + { + "epoch": 0.3402726753204343, + "grad_norm": 12.232146508628508, + "learning_rate": 1.2521022225955051e-07, + "logits/chosen": -1.6638747453689575, + "logits/rejected": -1.675110101699829, + "logps/chosen": -702.4622802734375, + "logps/rejected": -841.2349853515625, + "loss": 0.5551, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.236776351928711, + "rewards/margins": 0.7252943515777588, + "rewards/rejected": -2.962070941925049, + "step": 521 + }, + { + "epoch": 0.3409257898603968, + "grad_norm": 20.099246846498655, + "learning_rate": 1.2508298391192192e-07, + "logits/chosen": -1.6344259977340698, + "logits/rejected": -1.6060516834259033, + "logps/chosen": -690.3515625, + "logps/rejected": -725.8463134765625, + "loss": 0.5615, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1555328369140625, + "rewards/margins": 0.24002605676651, + "rewards/rejected": -2.3955588340759277, + "step": 522 + }, + { + "epoch": 0.3415789044003592, + "grad_norm": 28.678293250926668, + "learning_rate": 1.2495548487579377e-07, + "logits/chosen": -1.6345038414001465, + "logits/rejected": -1.6450937986373901, + "logps/chosen": -744.6649780273438, + "logps/rejected": -786.7367553710938, + "loss": 0.561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3111023902893066, + "rewards/margins": 0.4379115700721741, + "rewards/rejected": -2.749014377593994, + "step": 523 + }, + { + "epoch": 0.3422320189403217, + "grad_norm": 55.26079414913828, + "learning_rate": 1.248277258148152e-07, + "logits/chosen": -1.632555365562439, + "logits/rejected": -1.6509549617767334, + "logps/chosen": -782.6264038085938, + "logps/rejected": -826.6570434570312, + "loss": 0.5775, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1081671714782715, + "rewards/margins": 0.5822569131851196, + "rewards/rejected": -2.6904244422912598, + "step": 524 + }, + { + "epoch": 0.3428851334802841, + "grad_norm": 25.386727503372587, + "learning_rate": 1.2469970739398895e-07, + "logits/chosen": -1.5985530614852905, + "logits/rejected": -1.6135672330856323, + "logps/chosen": -738.0941772460938, + "logps/rejected": -705.9267578125, + "loss": 0.6116, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2364561557769775, + "rewards/margins": 0.3205564022064209, + "rewards/rejected": -2.5570127964019775, + "step": 525 + }, + { + "epoch": 0.3435382480202466, + "grad_norm": 24.51284417618824, + "learning_rate": 1.2457143027966763e-07, + "logits/chosen": -1.634655475616455, + "logits/rejected": -1.6211440563201904, + "logps/chosen": -750.4459228515625, + "logps/rejected": -730.4688720703125, + "loss": 0.5521, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.091447353363037, + "rewards/margins": 0.35747280716896057, + "rewards/rejected": -2.448920249938965, + "step": 526 + }, + { + "epoch": 0.344191362560209, + "grad_norm": 44.66836450950333, + "learning_rate": 1.2444289513955052e-07, + "logits/chosen": -1.5693323612213135, + "logits/rejected": -1.6260815858840942, + "logps/chosen": -779.1264038085938, + "logps/rejected": -830.4940185546875, + "loss": 0.5846, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.496044635772705, + "rewards/margins": 0.4798283278942108, + "rewards/rejected": -2.975872755050659, + "step": 527 + }, + { + "epoch": 0.34484447710017146, + "grad_norm": 58.69879433030782, + "learning_rate": 1.2431410264267977e-07, + "logits/chosen": -1.604842185974121, + "logits/rejected": -1.604213833808899, + "logps/chosen": -802.912109375, + "logps/rejected": -868.535888671875, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3447303771972656, + "rewards/margins": 0.5476406216621399, + "rewards/rejected": -2.8923707008361816, + "step": 528 + }, + { + "epoch": 0.3454975916401339, + "grad_norm": 36.25638810204293, + "learning_rate": 1.2418505345943732e-07, + "logits/chosen": -1.5913817882537842, + "logits/rejected": -1.631279468536377, + "logps/chosen": -760.42138671875, + "logps/rejected": -691.488037109375, + "loss": 0.582, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.126579999923706, + "rewards/margins": 0.14462479948997498, + "rewards/rejected": -2.2712044715881348, + "step": 529 + }, + { + "epoch": 0.34615070618009636, + "grad_norm": 22.96730047945058, + "learning_rate": 1.24055748261541e-07, + "logits/chosen": -1.6423602104187012, + "logits/rejected": -1.6510244607925415, + "logps/chosen": -752.2402954101562, + "logps/rejected": -776.55810546875, + "loss": 0.5723, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1990549564361572, + "rewards/margins": 0.4823690950870514, + "rewards/rejected": -2.681424140930176, + "step": 530 + }, + { + "epoch": 0.3468038207200588, + "grad_norm": 12.704637412947715, + "learning_rate": 1.2392618772204144e-07, + "logits/chosen": -1.5454316139221191, + "logits/rejected": -1.5524265766143799, + "logps/chosen": -704.6710815429688, + "logps/rejected": -745.8167114257812, + "loss": 0.5883, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.09757924079895, + "rewards/margins": 0.410552978515625, + "rewards/rejected": -2.508132219314575, + "step": 531 + }, + { + "epoch": 0.34745693526002125, + "grad_norm": 39.764198564369345, + "learning_rate": 1.2379637251531815e-07, + "logits/chosen": -1.6019591093063354, + "logits/rejected": -1.6283773183822632, + "logps/chosen": -743.0405883789062, + "logps/rejected": -784.7008666992188, + "loss": 0.5559, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1056554317474365, + "rewards/margins": 0.5337550044059753, + "rewards/rejected": -2.6394104957580566, + "step": 532 + }, + { + "epoch": 0.34811004979998367, + "grad_norm": 34.128906613498735, + "learning_rate": 1.2366630331707633e-07, + "logits/chosen": -1.6114816665649414, + "logits/rejected": -1.6253386735916138, + "logps/chosen": -733.2994995117188, + "logps/rejected": -764.6762084960938, + "loss": 0.5313, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0889058113098145, + "rewards/margins": 0.47615137696266174, + "rewards/rejected": -2.5650570392608643, + "step": 533 + }, + { + "epoch": 0.34876316433994614, + "grad_norm": 10.132934728994826, + "learning_rate": 1.2353598080434324e-07, + "logits/chosen": -1.632277250289917, + "logits/rejected": -1.6106319427490234, + "logps/chosen": -824.1144409179688, + "logps/rejected": -907.9232177734375, + "loss": 0.556, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.394355297088623, + "rewards/margins": 0.591341495513916, + "rewards/rejected": -2.985696792602539, + "step": 534 + }, + { + "epoch": 0.34941627887990856, + "grad_norm": 27.01945814074106, + "learning_rate": 1.234054056554646e-07, + "logits/chosen": -1.5513118505477905, + "logits/rejected": -1.5469458103179932, + "logps/chosen": -851.3035278320312, + "logps/rejected": -767.6931762695312, + "loss": 0.598, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3710482120513916, + "rewards/margins": 0.34137019515037537, + "rewards/rejected": -2.7124183177948, + "step": 535 + }, + { + "epoch": 0.35006939341987103, + "grad_norm": 20.633383781514684, + "learning_rate": 1.2327457855010123e-07, + "logits/chosen": -1.5809993743896484, + "logits/rejected": -1.5902115106582642, + "logps/chosen": -829.6868896484375, + "logps/rejected": -818.66455078125, + "loss": 0.5935, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.530834913253784, + "rewards/margins": 0.49049264192581177, + "rewards/rejected": -3.021327495574951, + "step": 536 + }, + { + "epoch": 0.35072250795983345, + "grad_norm": 8.274651730353373, + "learning_rate": 1.2314350016922534e-07, + "logits/chosen": -1.634692907333374, + "logits/rejected": -1.593540072441101, + "logps/chosen": -737.755126953125, + "logps/rejected": -739.923583984375, + "loss": 0.5132, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.248922348022461, + "rewards/margins": 0.5101175308227539, + "rewards/rejected": -2.759039878845215, + "step": 537 + }, + { + "epoch": 0.3513756224997959, + "grad_norm": 10.192458514719585, + "learning_rate": 1.2301217119511708e-07, + "logits/chosen": -1.5304259061813354, + "logits/rejected": -1.5202865600585938, + "logps/chosen": -765.9959106445312, + "logps/rejected": -765.4609375, + "loss": 0.5613, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.386655330657959, + "rewards/margins": 0.377580851316452, + "rewards/rejected": -2.7642359733581543, + "step": 538 + }, + { + "epoch": 0.35202873703975834, + "grad_norm": 34.040881919948525, + "learning_rate": 1.2288059231136108e-07, + "logits/chosen": -1.532372236251831, + "logits/rejected": -1.5646611452102661, + "logps/chosen": -741.4403686523438, + "logps/rejected": -813.6516723632812, + "loss": 0.5147, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.1854164600372314, + "rewards/margins": 0.7029819488525391, + "rewards/rejected": -2.8883986473083496, + "step": 539 + }, + { + "epoch": 0.3526818515797208, + "grad_norm": 12.432751143916052, + "learning_rate": 1.2274876420284258e-07, + "logits/chosen": -1.6108044385910034, + "logits/rejected": -1.6093095541000366, + "logps/chosen": -760.4302978515625, + "logps/rejected": -788.9061889648438, + "loss": 0.5564, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.301989793777466, + "rewards/margins": 0.3381144404411316, + "rewards/rejected": -2.640104293823242, + "step": 540 + }, + { + "epoch": 0.35333496611968324, + "grad_norm": 30.749409766082465, + "learning_rate": 1.2261668755574421e-07, + "logits/chosen": -1.5781821012496948, + "logits/rejected": -1.6185979843139648, + "logps/chosen": -819.251220703125, + "logps/rejected": -834.0662841796875, + "loss": 0.5351, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3528733253479004, + "rewards/margins": 0.5704101324081421, + "rewards/rejected": -2.923283338546753, + "step": 541 + }, + { + "epoch": 0.3539880806596457, + "grad_norm": 29.16774400285802, + "learning_rate": 1.2248436305754222e-07, + "logits/chosen": -1.648099422454834, + "logits/rejected": -1.6576378345489502, + "logps/chosen": -712.8369140625, + "logps/rejected": -755.3017578125, + "loss": 0.5369, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.99370276927948, + "rewards/margins": 0.4913850426673889, + "rewards/rejected": -2.4850876331329346, + "step": 542 + }, + { + "epoch": 0.35464119519960813, + "grad_norm": 20.69027911099315, + "learning_rate": 1.2235179139700304e-07, + "logits/chosen": -1.6414737701416016, + "logits/rejected": -1.624894142150879, + "logps/chosen": -813.76611328125, + "logps/rejected": -831.1404418945312, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3536837100982666, + "rewards/margins": 0.42595234513282776, + "rewards/rejected": -2.7796361446380615, + "step": 543 + }, + { + "epoch": 0.3552943097395706, + "grad_norm": 29.949634840397668, + "learning_rate": 1.222189732641795e-07, + "logits/chosen": -1.5087977647781372, + "logits/rejected": -1.4944639205932617, + "logps/chosen": -627.0595703125, + "logps/rejected": -755.531005859375, + "loss": 0.5464, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.9218859672546387, + "rewards/margins": 1.039351463317871, + "rewards/rejected": -2.9612374305725098, + "step": 544 + }, + { + "epoch": 0.355947424279533, + "grad_norm": 14.943355173162043, + "learning_rate": 1.220859093504074e-07, + "logits/chosen": -1.5890822410583496, + "logits/rejected": -1.5994880199432373, + "logps/chosen": -857.3690185546875, + "logps/rejected": -871.0678100585938, + "loss": 0.5952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6291110515594482, + "rewards/margins": 0.46089449524879456, + "rewards/rejected": -3.09000563621521, + "step": 545 + }, + { + "epoch": 0.3566005388194955, + "grad_norm": 15.565475336026891, + "learning_rate": 1.2195260034830187e-07, + "logits/chosen": -1.6251795291900635, + "logits/rejected": -1.6245146989822388, + "logps/chosen": -726.9799194335938, + "logps/rejected": -732.9411010742188, + "loss": 0.5526, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.323512077331543, + "rewards/margins": 0.2341621220111847, + "rewards/rejected": -2.5576741695404053, + "step": 546 + }, + { + "epoch": 0.3572536533594579, + "grad_norm": 23.991791905870468, + "learning_rate": 1.2181904695175374e-07, + "logits/chosen": -1.4712308645248413, + "logits/rejected": -1.4625868797302246, + "logps/chosen": -717.213134765625, + "logps/rejected": -820.203857421875, + "loss": 0.6012, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.27494740486145, + "rewards/margins": 0.7880808115005493, + "rewards/rejected": -3.063028335571289, + "step": 547 + }, + { + "epoch": 0.3579067678994204, + "grad_norm": 35.7010085328891, + "learning_rate": 1.2168524985592597e-07, + "logits/chosen": -1.5415699481964111, + "logits/rejected": -1.5597350597381592, + "logps/chosen": -772.77978515625, + "logps/rejected": -746.6201782226562, + "loss": 0.5508, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.421182155609131, + "rewards/margins": 0.254733145236969, + "rewards/rejected": -2.675915241241455, + "step": 548 + }, + { + "epoch": 0.3585598824393828, + "grad_norm": 14.830303633870002, + "learning_rate": 1.2155120975724996e-07, + "logits/chosen": -1.5214571952819824, + "logits/rejected": -1.5628215074539185, + "logps/chosen": -706.9215087890625, + "logps/rejected": -791.664794921875, + "loss": 0.5483, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0701212882995605, + "rewards/margins": 0.7649804949760437, + "rewards/rejected": -2.83510160446167, + "step": 549 + }, + { + "epoch": 0.3592129969793453, + "grad_norm": 11.60924722942488, + "learning_rate": 1.214169273534221e-07, + "logits/chosen": -1.5903964042663574, + "logits/rejected": -1.6238888502120972, + "logps/chosen": -754.437255859375, + "logps/rejected": -765.71435546875, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2985119819641113, + "rewards/margins": 0.33853238821029663, + "rewards/rejected": -2.6370441913604736, + "step": 550 + }, + { + "epoch": 0.3598661115193077, + "grad_norm": 19.208616054146393, + "learning_rate": 1.2128240334339978e-07, + "logits/chosen": -1.6252080202102661, + "logits/rejected": -1.6387659311294556, + "logps/chosen": -827.2951049804688, + "logps/rejected": -865.0391845703125, + "loss": 0.5597, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.646684408187866, + "rewards/margins": 0.5018330812454224, + "rewards/rejected": -3.14851713180542, + "step": 551 + }, + { + "epoch": 0.36051922605927017, + "grad_norm": 15.97827516501005, + "learning_rate": 1.211476384273982e-07, + "logits/chosen": -1.6076312065124512, + "logits/rejected": -1.6076362133026123, + "logps/chosen": -735.5797119140625, + "logps/rejected": -755.5748291015625, + "loss": 0.5435, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.272061347961426, + "rewards/margins": 0.8418019413948059, + "rewards/rejected": -3.113863229751587, + "step": 552 + }, + { + "epoch": 0.3611723405992326, + "grad_norm": 12.277648543582611, + "learning_rate": 1.2101263330688638e-07, + "logits/chosen": -1.6942815780639648, + "logits/rejected": -1.6733278036117554, + "logps/chosen": -777.0508422851562, + "logps/rejected": -763.1177368164062, + "loss": 0.5924, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.428075075149536, + "rewards/margins": 0.363750159740448, + "rewards/rejected": -2.79182505607605, + "step": 553 + }, + { + "epoch": 0.36182545513919506, + "grad_norm": 10.465467914806046, + "learning_rate": 1.208773886845837e-07, + "logits/chosen": -1.604886531829834, + "logits/rejected": -1.6047513484954834, + "logps/chosen": -739.2161865234375, + "logps/rejected": -860.6815185546875, + "loss": 0.511, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.230139970779419, + "rewards/margins": 0.676809549331665, + "rewards/rejected": -2.906949520111084, + "step": 554 + }, + { + "epoch": 0.3624785696791575, + "grad_norm": 21.364165430156397, + "learning_rate": 1.2074190526445616e-07, + "logits/chosen": -1.6185420751571655, + "logits/rejected": -1.6467700004577637, + "logps/chosen": -812.8031005859375, + "logps/rejected": -845.0140991210938, + "loss": 0.4957, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4488282203674316, + "rewards/margins": 0.606843113899231, + "rewards/rejected": -3.055671453475952, + "step": 555 + }, + { + "epoch": 0.36313168421911995, + "grad_norm": 12.060857675840488, + "learning_rate": 1.2060618375171275e-07, + "logits/chosen": -1.582899808883667, + "logits/rejected": -1.5877315998077393, + "logps/chosen": -716.4563598632812, + "logps/rejected": -797.8826293945312, + "loss": 0.5325, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.498225688934326, + "rewards/margins": 0.3760990798473358, + "rewards/rejected": -2.8743247985839844, + "step": 556 + }, + { + "epoch": 0.3637847987590824, + "grad_norm": 13.492857611613953, + "learning_rate": 1.2047022485280168e-07, + "logits/chosen": -1.539080023765564, + "logits/rejected": -1.570099115371704, + "logps/chosen": -700.08203125, + "logps/rejected": -716.7232666015625, + "loss": 0.5849, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2837417125701904, + "rewards/margins": 0.5059683322906494, + "rewards/rejected": -2.78971004486084, + "step": 557 + }, + { + "epoch": 0.36443791329904485, + "grad_norm": 33.571052467052354, + "learning_rate": 1.2033402927540688e-07, + "logits/chosen": -1.5120295286178589, + "logits/rejected": -1.4916774034500122, + "logps/chosen": -786.6419677734375, + "logps/rejected": -877.5504760742188, + "loss": 0.5212, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.3839404582977295, + "rewards/margins": 0.8281471133232117, + "rewards/rejected": -3.212087631225586, + "step": 558 + }, + { + "epoch": 0.36509102783900726, + "grad_norm": 49.87336711471519, + "learning_rate": 1.2019759772844423e-07, + "logits/chosen": -1.570830225944519, + "logits/rejected": -1.612029790878296, + "logps/chosen": -759.057373046875, + "logps/rejected": -869.3032836914062, + "loss": 0.608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1914572715759277, + "rewards/margins": 0.8840473890304565, + "rewards/rejected": -3.075504779815674, + "step": 559 + }, + { + "epoch": 0.36574414237896974, + "grad_norm": 74.5712473751751, + "learning_rate": 1.2006093092205777e-07, + "logits/chosen": -1.4767194986343384, + "logits/rejected": -1.4949413537979126, + "logps/chosen": -698.3121948242188, + "logps/rejected": -743.3089599609375, + "loss": 0.5547, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.0461912155151367, + "rewards/margins": 0.5971631407737732, + "rewards/rejected": -2.643354654312134, + "step": 560 + }, + { + "epoch": 0.36639725691893216, + "grad_norm": 44.74846351811676, + "learning_rate": 1.199240295676162e-07, + "logits/chosen": -1.5869390964508057, + "logits/rejected": -1.600525140762329, + "logps/chosen": -759.1281127929688, + "logps/rejected": -777.5281372070312, + "loss": 0.6048, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.285490036010742, + "rewards/margins": 0.29206350445747375, + "rewards/rejected": -2.5775535106658936, + "step": 561 + }, + { + "epoch": 0.36705037145889463, + "grad_norm": 10.999921054518422, + "learning_rate": 1.1978689437770896e-07, + "logits/chosen": -1.5283629894256592, + "logits/rejected": -1.5792925357818604, + "logps/chosen": -674.9652709960938, + "logps/rejected": -813.370849609375, + "loss": 0.5122, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0200095176696777, + "rewards/margins": 0.8305485844612122, + "rewards/rejected": -2.850558280944824, + "step": 562 + }, + { + "epoch": 0.36770348599885705, + "grad_norm": 55.10455252064543, + "learning_rate": 1.1964952606614276e-07, + "logits/chosen": -1.6453619003295898, + "logits/rejected": -1.6702425479888916, + "logps/chosen": -794.0926513671875, + "logps/rejected": -870.9822998046875, + "loss": 0.5045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6258649826049805, + "rewards/margins": 0.731311559677124, + "rewards/rejected": -3.3571763038635254, + "step": 563 + }, + { + "epoch": 0.3683566005388195, + "grad_norm": 24.477947303915716, + "learning_rate": 1.1951192534793764e-07, + "logits/chosen": -1.5885945558547974, + "logits/rejected": -1.5889862775802612, + "logps/chosen": -740.5401611328125, + "logps/rejected": -770.5545654296875, + "loss": 0.5242, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3373959064483643, + "rewards/margins": 0.3964667022228241, + "rewards/rejected": -2.7338626384735107, + "step": 564 + }, + { + "epoch": 0.36900971507878194, + "grad_norm": 18.91974104462604, + "learning_rate": 1.193740929393234e-07, + "logits/chosen": -1.5289589166641235, + "logits/rejected": -1.5133321285247803, + "logps/chosen": -695.8096923828125, + "logps/rejected": -892.4715576171875, + "loss": 0.5767, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.182143211364746, + "rewards/margins": 1.0010870695114136, + "rewards/rejected": -3.18323016166687, + "step": 565 + }, + { + "epoch": 0.36966282961874436, + "grad_norm": 96.53142007828465, + "learning_rate": 1.1923602955773583e-07, + "logits/chosen": -1.585618257522583, + "logits/rejected": -1.580026626586914, + "logps/chosen": -676.3846435546875, + "logps/rejected": -825.5516357421875, + "loss": 0.5286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.03085994720459, + "rewards/margins": 0.9540601968765259, + "rewards/rejected": -2.984920024871826, + "step": 566 + }, + { + "epoch": 0.37031594415870683, + "grad_norm": 9.841984604340173, + "learning_rate": 1.1909773592181287e-07, + "logits/chosen": -1.6656473875045776, + "logits/rejected": -1.6531533002853394, + "logps/chosen": -783.5050659179688, + "logps/rejected": -795.6260986328125, + "loss": 0.5741, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.256021738052368, + "rewards/margins": 0.621117353439331, + "rewards/rejected": -2.877139091491699, + "step": 567 + }, + { + "epoch": 0.37096905869866925, + "grad_norm": 13.260118641189585, + "learning_rate": 1.189592127513911e-07, + "logits/chosen": -1.5660011768341064, + "logits/rejected": -1.5593037605285645, + "logps/chosen": -699.3264770507812, + "logps/rejected": -718.5197143554688, + "loss": 0.5123, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3721680641174316, + "rewards/margins": 0.40681201219558716, + "rewards/rejected": -2.778980255126953, + "step": 568 + }, + { + "epoch": 0.3716221732386317, + "grad_norm": 66.7819908806299, + "learning_rate": 1.1882046076750176e-07, + "logits/chosen": -1.6180167198181152, + "logits/rejected": -1.6349365711212158, + "logps/chosen": -837.523681640625, + "logps/rejected": -804.4877319335938, + "loss": 0.5133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4943768978118896, + "rewards/margins": 0.5837184190750122, + "rewards/rejected": -3.078094959259033, + "step": 569 + }, + { + "epoch": 0.37227528777859414, + "grad_norm": 29.37833169883412, + "learning_rate": 1.186814806923671e-07, + "logits/chosen": -1.5915193557739258, + "logits/rejected": -1.5684032440185547, + "logps/chosen": -738.4844360351562, + "logps/rejected": -740.8411254882812, + "loss": 0.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2804222106933594, + "rewards/margins": 0.37493157386779785, + "rewards/rejected": -2.6553537845611572, + "step": 570 + }, + { + "epoch": 0.3729284023185566, + "grad_norm": 48.10084583273166, + "learning_rate": 1.1854227324939669e-07, + "logits/chosen": -1.612912654876709, + "logits/rejected": -1.5692781209945679, + "logps/chosen": -897.7774658203125, + "logps/rejected": -913.3131103515625, + "loss": 0.5835, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.936650276184082, + "rewards/margins": 0.400768518447876, + "rewards/rejected": -3.337418794631958, + "step": 571 + }, + { + "epoch": 0.37358151685851904, + "grad_norm": 11.716756841277345, + "learning_rate": 1.1840283916318347e-07, + "logits/chosen": -1.5072287321090698, + "logits/rejected": -1.4601385593414307, + "logps/chosen": -745.0501708984375, + "logps/rejected": -801.019775390625, + "loss": 0.5261, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.25480055809021, + "rewards/margins": 0.8588010668754578, + "rewards/rejected": -3.1136016845703125, + "step": 572 + }, + { + "epoch": 0.3742346313984815, + "grad_norm": 18.580715359087044, + "learning_rate": 1.1826317915950021e-07, + "logits/chosen": -1.5958278179168701, + "logits/rejected": -1.6139434576034546, + "logps/chosen": -767.4638061523438, + "logps/rejected": -891.0325927734375, + "loss": 0.5267, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.512181043624878, + "rewards/margins": 0.7721086144447327, + "rewards/rejected": -3.284290075302124, + "step": 573 + }, + { + "epoch": 0.37488774593844393, + "grad_norm": 24.286745313641156, + "learning_rate": 1.181232939652955e-07, + "logits/chosen": -1.5305031538009644, + "logits/rejected": -1.4823194742202759, + "logps/chosen": -748.1857299804688, + "logps/rejected": -847.1156005859375, + "loss": 0.5462, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.5048766136169434, + "rewards/margins": 0.7592265009880066, + "rewards/rejected": -3.2641031742095947, + "step": 574 + }, + { + "epoch": 0.3755408604784064, + "grad_norm": 29.652610575297764, + "learning_rate": 1.1798318430869012e-07, + "logits/chosen": -1.5794718265533447, + "logits/rejected": -1.5874780416488647, + "logps/chosen": -794.8291625976562, + "logps/rejected": -825.0755615234375, + "loss": 0.5033, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.382725238800049, + "rewards/margins": 0.7552610039710999, + "rewards/rejected": -3.137986183166504, + "step": 575 + }, + { + "epoch": 0.3761939750183688, + "grad_norm": 58.912701269068016, + "learning_rate": 1.1784285091897322e-07, + "logits/chosen": -1.6159145832061768, + "logits/rejected": -1.6184728145599365, + "logps/chosen": -892.6289672851562, + "logps/rejected": -935.57568359375, + "loss": 0.5889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7877440452575684, + "rewards/margins": 0.5180625319480896, + "rewards/rejected": -3.3058066368103027, + "step": 576 + }, + { + "epoch": 0.3768470895583313, + "grad_norm": 92.84475707292661, + "learning_rate": 1.1770229452659852e-07, + "logits/chosen": -1.5818086862564087, + "logits/rejected": -1.583407998085022, + "logps/chosen": -678.228515625, + "logps/rejected": -849.5630493164062, + "loss": 0.5562, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2353968620300293, + "rewards/margins": 1.1506848335266113, + "rewards/rejected": -3.386082172393799, + "step": 577 + }, + { + "epoch": 0.3775002040982937, + "grad_norm": 24.53245073109831, + "learning_rate": 1.1756151586318044e-07, + "logits/chosen": -1.5623910427093506, + "logits/rejected": -1.6067014932632446, + "logps/chosen": -806.4697875976562, + "logps/rejected": -860.4945068359375, + "loss": 0.5023, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.364492654800415, + "rewards/margins": 0.7060398459434509, + "rewards/rejected": -3.07053279876709, + "step": 578 + }, + { + "epoch": 0.3781533186382562, + "grad_norm": 18.85494518114228, + "learning_rate": 1.174205156614904e-07, + "logits/chosen": -1.4894192218780518, + "logits/rejected": -1.5204534530639648, + "logps/chosen": -829.0343017578125, + "logps/rejected": -848.0327758789062, + "loss": 0.5509, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7499516010284424, + "rewards/margins": 0.4761306643486023, + "rewards/rejected": -3.2260818481445312, + "step": 579 + }, + { + "epoch": 0.3788064331782186, + "grad_norm": 15.464999889697193, + "learning_rate": 1.1727929465545294e-07, + "logits/chosen": -1.5147112607955933, + "logits/rejected": -1.6048409938812256, + "logps/chosen": -769.5234985351562, + "logps/rejected": -810.0481567382812, + "loss": 0.6104, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7371156215667725, + "rewards/margins": 0.32579949498176575, + "rewards/rejected": -3.062915325164795, + "step": 580 + }, + { + "epoch": 0.3794595477181811, + "grad_norm": 49.00611608700767, + "learning_rate": 1.1713785358014193e-07, + "logits/chosen": -1.603877305984497, + "logits/rejected": -1.6114190816879272, + "logps/chosen": -693.6171875, + "logps/rejected": -696.5467529296875, + "loss": 0.5624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.458164691925049, + "rewards/margins": 0.3706192076206207, + "rewards/rejected": -2.8287839889526367, + "step": 581 + }, + { + "epoch": 0.3801126622581435, + "grad_norm": 41.687356680487866, + "learning_rate": 1.1699619317177668e-07, + "logits/chosen": -1.492578387260437, + "logits/rejected": -1.4849687814712524, + "logps/chosen": -711.063232421875, + "logps/rejected": -779.2449951171875, + "loss": 0.5001, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.399482250213623, + "rewards/margins": 0.7047311663627625, + "rewards/rejected": -3.104212999343872, + "step": 582 + }, + { + "epoch": 0.38076577679810597, + "grad_norm": 40.272642825045395, + "learning_rate": 1.1685431416771825e-07, + "logits/chosen": -1.5852625370025635, + "logits/rejected": -1.6129764318466187, + "logps/chosen": -813.8887939453125, + "logps/rejected": -817.9891357421875, + "loss": 0.5583, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6035213470458984, + "rewards/margins": 0.5876621007919312, + "rewards/rejected": -3.19118332862854, + "step": 583 + }, + { + "epoch": 0.3814188913380684, + "grad_norm": 24.667173577610694, + "learning_rate": 1.1671221730646543e-07, + "logits/chosen": -1.5789833068847656, + "logits/rejected": -1.5653014183044434, + "logps/chosen": -698.559814453125, + "logps/rejected": -799.244384765625, + "loss": 0.5128, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.156489610671997, + "rewards/margins": 0.8030795454978943, + "rewards/rejected": -2.959568977355957, + "step": 584 + }, + { + "epoch": 0.38207200587803086, + "grad_norm": 10.056286338983814, + "learning_rate": 1.1656990332765101e-07, + "logits/chosen": -1.579622745513916, + "logits/rejected": -1.5801726579666138, + "logps/chosen": -727.0596923828125, + "logps/rejected": -783.4789428710938, + "loss": 0.5499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.434051036834717, + "rewards/margins": 0.5680094957351685, + "rewards/rejected": -3.002060651779175, + "step": 585 + }, + { + "epoch": 0.3827251204179933, + "grad_norm": 10.639118531618282, + "learning_rate": 1.1642737297203792e-07, + "logits/chosen": -1.550283432006836, + "logits/rejected": -1.545919418334961, + "logps/chosen": -678.3670043945312, + "logps/rejected": -830.1644897460938, + "loss": 0.5517, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.09962797164917, + "rewards/margins": 0.9918718934059143, + "rewards/rejected": -3.0914998054504395, + "step": 586 + }, + { + "epoch": 0.38337823495795575, + "grad_norm": 20.197574332614796, + "learning_rate": 1.1628462698151538e-07, + "logits/chosen": -1.5244333744049072, + "logits/rejected": -1.5570902824401855, + "logps/chosen": -690.13720703125, + "logps/rejected": -777.196533203125, + "loss": 0.5574, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.285308837890625, + "rewards/margins": 0.6684877872467041, + "rewards/rejected": -2.95379638671875, + "step": 587 + }, + { + "epoch": 0.3840313494979182, + "grad_norm": 66.37641884436981, + "learning_rate": 1.1614166609909498e-07, + "logits/chosen": -1.5359015464782715, + "logits/rejected": -1.546217679977417, + "logps/chosen": -738.7581176757812, + "logps/rejected": -772.0260009765625, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2918949127197266, + "rewards/margins": 0.4515775740146637, + "rewards/rejected": -2.7434725761413574, + "step": 588 + }, + { + "epoch": 0.38468446403788065, + "grad_norm": 13.46028213535803, + "learning_rate": 1.1599849106890683e-07, + "logits/chosen": -1.527596116065979, + "logits/rejected": -1.5285677909851074, + "logps/chosen": -734.326171875, + "logps/rejected": -749.241455078125, + "loss": 0.568, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2978053092956543, + "rewards/margins": 0.3799334466457367, + "rewards/rejected": -2.677738666534424, + "step": 589 + }, + { + "epoch": 0.38533757857784307, + "grad_norm": 22.97104425543304, + "learning_rate": 1.1585510263619577e-07, + "logits/chosen": -1.5912518501281738, + "logits/rejected": -1.5906767845153809, + "logps/chosen": -724.542236328125, + "logps/rejected": -752.7775268554688, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.344860076904297, + "rewards/margins": 0.30979910492897034, + "rewards/rejected": -2.6546592712402344, + "step": 590 + }, + { + "epoch": 0.38599069311780554, + "grad_norm": 10.721634341469683, + "learning_rate": 1.157115015473174e-07, + "logits/chosen": -1.5702768564224243, + "logits/rejected": -1.5887250900268555, + "logps/chosen": -711.1195678710938, + "logps/rejected": -744.3549194335938, + "loss": 0.5678, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.271317958831787, + "rewards/margins": 0.5261479020118713, + "rewards/rejected": -2.7974658012390137, + "step": 591 + }, + { + "epoch": 0.38664380765776796, + "grad_norm": 20.96230357570753, + "learning_rate": 1.155676885497342e-07, + "logits/chosen": -1.587166666984558, + "logits/rejected": -1.5863206386566162, + "logps/chosen": -703.5427856445312, + "logps/rejected": -760.740234375, + "loss": 0.5131, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3268959522247314, + "rewards/margins": 0.47407492995262146, + "rewards/rejected": -2.800971031188965, + "step": 592 + }, + { + "epoch": 0.38729692219773043, + "grad_norm": 19.489795146613808, + "learning_rate": 1.154236643920117e-07, + "logits/chosen": -1.661864161491394, + "logits/rejected": -1.6856908798217773, + "logps/chosen": -724.410400390625, + "logps/rejected": -773.7425537109375, + "loss": 0.5241, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.489246368408203, + "rewards/margins": 0.5750184059143066, + "rewards/rejected": -3.064265012741089, + "step": 593 + }, + { + "epoch": 0.38795003673769285, + "grad_norm": 73.12725772218887, + "learning_rate": 1.1527942982381452e-07, + "logits/chosen": -1.5697174072265625, + "logits/rejected": -1.5859888792037964, + "logps/chosen": -754.874267578125, + "logps/rejected": -791.1784057617188, + "loss": 0.5575, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.533051013946533, + "rewards/margins": 0.5103416442871094, + "rewards/rejected": -3.0433928966522217, + "step": 594 + }, + { + "epoch": 0.3886031512776553, + "grad_norm": 50.458839536472475, + "learning_rate": 1.1513498559590252e-07, + "logits/chosen": -1.6159231662750244, + "logits/rejected": -1.5974745750427246, + "logps/chosen": -762.911376953125, + "logps/rejected": -743.5185546875, + "loss": 0.5855, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3125293254852295, + "rewards/margins": 0.30350756645202637, + "rewards/rejected": -2.616036891937256, + "step": 595 + }, + { + "epoch": 0.38925626581761774, + "grad_norm": 10.34566924087224, + "learning_rate": 1.1499033246012685e-07, + "logits/chosen": -1.5862224102020264, + "logits/rejected": -1.5608221292495728, + "logps/chosen": -864.4752807617188, + "logps/rejected": -983.65087890625, + "loss": 0.5411, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.47229266166687, + "rewards/margins": 0.9073572754859924, + "rewards/rejected": -3.379650115966797, + "step": 596 + }, + { + "epoch": 0.3899093803575802, + "grad_norm": 27.551259833581323, + "learning_rate": 1.1484547116942601e-07, + "logits/chosen": -1.6429082155227661, + "logits/rejected": -1.6100890636444092, + "logps/chosen": -772.03076171875, + "logps/rejected": -794.9556884765625, + "loss": 0.551, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.502126693725586, + "rewards/margins": 0.5208601951599121, + "rewards/rejected": -3.022986888885498, + "step": 597 + }, + { + "epoch": 0.39056249489754263, + "grad_norm": 21.20639182270992, + "learning_rate": 1.1470040247782205e-07, + "logits/chosen": -1.5553147792816162, + "logits/rejected": -1.5810749530792236, + "logps/chosen": -782.7449951171875, + "logps/rejected": -797.2606201171875, + "loss": 0.5487, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.488363742828369, + "rewards/margins": 0.4950363039970398, + "rewards/rejected": -2.983400344848633, + "step": 598 + }, + { + "epoch": 0.3912156094375051, + "grad_norm": 61.94478857020836, + "learning_rate": 1.1455512714041656e-07, + "logits/chosen": -1.567291021347046, + "logits/rejected": -1.5616893768310547, + "logps/chosen": -682.39306640625, + "logps/rejected": -779.3341674804688, + "loss": 0.5335, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.248433828353882, + "rewards/margins": 0.45859482884407043, + "rewards/rejected": -2.707028865814209, + "step": 599 + }, + { + "epoch": 0.3918687239774675, + "grad_norm": 27.096742126205903, + "learning_rate": 1.1440964591338669e-07, + "logits/chosen": -1.5842782258987427, + "logits/rejected": -1.588181734085083, + "logps/chosen": -848.2506713867188, + "logps/rejected": -845.3145141601562, + "loss": 0.5896, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.536600112915039, + "rewards/margins": 0.5013671517372131, + "rewards/rejected": -3.0379669666290283, + "step": 600 + }, + { + "epoch": 0.3918687239774675, + "eval_logits/chosen": -1.584659457206726, + "eval_logits/rejected": -1.5836009979248047, + "eval_logps/chosen": -748.0584106445312, + "eval_logps/rejected": -799.712158203125, + "eval_loss": 0.5452645421028137, + "eval_rewards/accuracies": 0.7179999947547913, + "eval_rewards/chosen": -2.3771302700042725, + "eval_rewards/margins": 0.5975968241691589, + "eval_rewards/rejected": -2.9747273921966553, + "eval_runtime": 300.4063, + "eval_samples_per_second": 13.315, + "eval_steps_per_second": 0.832, + "step": 600 + }, + { + "epoch": 0.39252183851743, + "grad_norm": 12.348978382810378, + "learning_rate": 1.142639595539813e-07, + "logits/chosen": -1.5698190927505493, + "logits/rejected": -1.529393196105957, + "logps/chosen": -787.7490234375, + "logps/rejected": -788.6868896484375, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.350584030151367, + "rewards/margins": 0.6392349004745483, + "rewards/rejected": -2.989819049835205, + "step": 601 + }, + { + "epoch": 0.3931749530573924, + "grad_norm": 10.679850846766614, + "learning_rate": 1.1411806882051702e-07, + "logits/chosen": -1.5929275751113892, + "logits/rejected": -1.5805671215057373, + "logps/chosen": -723.1961669921875, + "logps/rejected": -713.7118530273438, + "loss": 0.5297, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.31701397895813, + "rewards/margins": 0.5196025371551514, + "rewards/rejected": -2.836616039276123, + "step": 602 + }, + { + "epoch": 0.3938280675973549, + "grad_norm": 35.708189094404084, + "learning_rate": 1.1397197447237423e-07, + "logits/chosen": -1.5436877012252808, + "logits/rejected": -1.562942624092102, + "logps/chosen": -728.4673461914062, + "logps/rejected": -798.8472290039062, + "loss": 0.5137, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6131293773651123, + "rewards/margins": 0.4990772604942322, + "rewards/rejected": -3.11220645904541, + "step": 603 + }, + { + "epoch": 0.3944811821373173, + "grad_norm": 82.94052901290438, + "learning_rate": 1.1382567726999319e-07, + "logits/chosen": -1.5964429378509521, + "logits/rejected": -1.5941182374954224, + "logps/chosen": -796.7896118164062, + "logps/rejected": -837.9287109375, + "loss": 0.5751, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.377279043197632, + "rewards/margins": 0.43824002146720886, + "rewards/rejected": -2.815519094467163, + "step": 604 + }, + { + "epoch": 0.3951342966772798, + "grad_norm": 48.03054462615425, + "learning_rate": 1.1367917797487002e-07, + "logits/chosen": -1.5520787239074707, + "logits/rejected": -1.5235430002212524, + "logps/chosen": -703.1958618164062, + "logps/rejected": -713.5035400390625, + "loss": 0.6487, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.3644354343414307, + "rewards/margins": 0.27925288677215576, + "rewards/rejected": -2.643688440322876, + "step": 605 + }, + { + "epoch": 0.3957874112172422, + "grad_norm": 32.26597426964153, + "learning_rate": 1.1353247734955275e-07, + "logits/chosen": -1.527002215385437, + "logits/rejected": -1.5140247344970703, + "logps/chosen": -783.9526977539062, + "logps/rejected": -775.091796875, + "loss": 0.5689, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.74333119392395, + "rewards/margins": 0.22180992364883423, + "rewards/rejected": -2.9651412963867188, + "step": 606 + }, + { + "epoch": 0.3964405257572047, + "grad_norm": 10.429513069602487, + "learning_rate": 1.133855761576374e-07, + "logits/chosen": -1.5365808010101318, + "logits/rejected": -1.5101385116577148, + "logps/chosen": -651.0933837890625, + "logps/rejected": -708.3961181640625, + "loss": 0.5409, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9835056066513062, + "rewards/margins": 0.3382415473461151, + "rewards/rejected": -2.321747303009033, + "step": 607 + }, + { + "epoch": 0.3970936402971671, + "grad_norm": 19.72537569851452, + "learning_rate": 1.1323847516376392e-07, + "logits/chosen": -1.5134871006011963, + "logits/rejected": -1.545701026916504, + "logps/chosen": -684.7098999023438, + "logps/rejected": -666.0336303710938, + "loss": 0.5726, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.495856523513794, + "rewards/margins": 0.06803350895643234, + "rewards/rejected": -2.563889980316162, + "step": 608 + }, + { + "epoch": 0.39774675483712957, + "grad_norm": 78.12335940886078, + "learning_rate": 1.1309117513361228e-07, + "logits/chosen": -1.4830129146575928, + "logits/rejected": -1.4823014736175537, + "logps/chosen": -758.7376708984375, + "logps/rejected": -757.42138671875, + "loss": 0.5917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4975523948669434, + "rewards/margins": 0.32604148983955383, + "rewards/rejected": -2.823594093322754, + "step": 609 + }, + { + "epoch": 0.398399869377092, + "grad_norm": 23.094040308095767, + "learning_rate": 1.1294367683389848e-07, + "logits/chosen": -1.5845017433166504, + "logits/rejected": -1.556647539138794, + "logps/chosen": -739.6962280273438, + "logps/rejected": -748.73876953125, + "loss": 0.5258, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4809250831604004, + "rewards/margins": 0.4438374638557434, + "rewards/rejected": -2.924762725830078, + "step": 610 + }, + { + "epoch": 0.39905298391705446, + "grad_norm": 39.1177725554315, + "learning_rate": 1.1279598103237047e-07, + "logits/chosen": -1.60092031955719, + "logits/rejected": -1.6122784614562988, + "logps/chosen": -759.567138671875, + "logps/rejected": -808.7623291015625, + "loss": 0.5337, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.235379695892334, + "rewards/margins": 0.5821799039840698, + "rewards/rejected": -2.8175594806671143, + "step": 611 + }, + { + "epoch": 0.3997060984570169, + "grad_norm": 14.877552981780648, + "learning_rate": 1.1264808849780429e-07, + "logits/chosen": -1.5760529041290283, + "logits/rejected": -1.5651963949203491, + "logps/chosen": -779.1673583984375, + "logps/rejected": -973.266845703125, + "loss": 0.5072, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3549463748931885, + "rewards/margins": 1.1115797758102417, + "rewards/rejected": -3.4665260314941406, + "step": 612 + }, + { + "epoch": 0.40035921299697935, + "grad_norm": 38.151466897675625, + "learning_rate": 1.125e-07, + "logits/chosen": -1.5994532108306885, + "logits/rejected": -1.62065589427948, + "logps/chosen": -740.2821655273438, + "logps/rejected": -788.3598022460938, + "loss": 0.5041, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.354248046875, + "rewards/margins": 0.6767839193344116, + "rewards/rejected": -3.031032085418701, + "step": 613 + }, + { + "epoch": 0.40101232753694177, + "grad_norm": 31.35161543065443, + "learning_rate": 1.123517163097776e-07, + "logits/chosen": -1.6029822826385498, + "logits/rejected": -1.6242347955703735, + "logps/chosen": -775.5657348632812, + "logps/rejected": -801.442138671875, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5471549034118652, + "rewards/margins": 0.6026932001113892, + "rewards/rejected": -3.149848222732544, + "step": 614 + }, + { + "epoch": 0.40166544207690424, + "grad_norm": 42.24561626020638, + "learning_rate": 1.1220323819897319e-07, + "logits/chosen": -1.525698184967041, + "logits/rejected": -1.528663992881775, + "logps/chosen": -715.5992431640625, + "logps/rejected": -701.003173828125, + "loss": 0.5406, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.155259609222412, + "rewards/margins": 0.44246095418930054, + "rewards/rejected": -2.5977203845977783, + "step": 615 + }, + { + "epoch": 0.40231855661686666, + "grad_norm": 43.53559559563413, + "learning_rate": 1.120545664404348e-07, + "logits/chosen": -1.5581973791122437, + "logits/rejected": -1.511064052581787, + "logps/chosen": -835.9921264648438, + "logps/rejected": -890.62744140625, + "loss": 0.4782, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.300487518310547, + "rewards/margins": 0.912307620048523, + "rewards/rejected": -3.2127950191497803, + "step": 616 + }, + { + "epoch": 0.40297167115682914, + "grad_norm": 22.99506036909689, + "learning_rate": 1.1190570180801842e-07, + "logits/chosen": -1.5422519445419312, + "logits/rejected": -1.580669641494751, + "logps/chosen": -782.707763671875, + "logps/rejected": -868.0604858398438, + "loss": 0.5118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.45739483833313, + "rewards/margins": 0.641864538192749, + "rewards/rejected": -3.0992591381073, + "step": 617 + }, + { + "epoch": 0.40362478569679155, + "grad_norm": 9.096921333008568, + "learning_rate": 1.11756645076584e-07, + "logits/chosen": -1.5296615362167358, + "logits/rejected": -1.5493112802505493, + "logps/chosen": -742.4532470703125, + "logps/rejected": -760.0758666992188, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1916160583496094, + "rewards/margins": 0.4671185612678528, + "rewards/rejected": -2.6587345600128174, + "step": 618 + }, + { + "epoch": 0.40427790023675403, + "grad_norm": 19.544727760189303, + "learning_rate": 1.1160739702199136e-07, + "logits/chosen": -1.5654833316802979, + "logits/rejected": -1.6011905670166016, + "logps/chosen": -678.881103515625, + "logps/rejected": -688.1021728515625, + "loss": 0.605, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.180164098739624, + "rewards/margins": 0.3665446937084198, + "rewards/rejected": -2.546708583831787, + "step": 619 + }, + { + "epoch": 0.40493101477671645, + "grad_norm": 33.42410909057005, + "learning_rate": 1.1145795842109621e-07, + "logits/chosen": -1.4947848320007324, + "logits/rejected": -1.5099647045135498, + "logps/chosen": -757.1992797851562, + "logps/rejected": -829.0934448242188, + "loss": 0.6061, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2935428619384766, + "rewards/margins": 0.46146607398986816, + "rewards/rejected": -2.7550089359283447, + "step": 620 + }, + { + "epoch": 0.4055841293166789, + "grad_norm": 18.79594230641745, + "learning_rate": 1.1130833005174605e-07, + "logits/chosen": -1.5316284894943237, + "logits/rejected": -1.5386699438095093, + "logps/chosen": -728.5615844726562, + "logps/rejected": -749.1942138671875, + "loss": 0.5481, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.2535488605499268, + "rewards/margins": 0.5478615760803223, + "rewards/rejected": -2.80141019821167, + "step": 621 + }, + { + "epoch": 0.40623724385664134, + "grad_norm": 8.286798076996332, + "learning_rate": 1.1115851269277616e-07, + "logits/chosen": -1.6122698783874512, + "logits/rejected": -1.6200590133666992, + "logps/chosen": -774.5802001953125, + "logps/rejected": -867.3875732421875, + "loss": 0.5414, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.146087884902954, + "rewards/margins": 0.6804053783416748, + "rewards/rejected": -2.82649302482605, + "step": 622 + }, + { + "epoch": 0.4068903583966038, + "grad_norm": 36.91218473973044, + "learning_rate": 1.1100850712400558e-07, + "logits/chosen": -1.392214298248291, + "logits/rejected": -1.4010385274887085, + "logps/chosen": -678.6796264648438, + "logps/rejected": -669.0376586914062, + "loss": 0.5327, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.425049066543579, + "rewards/margins": 0.2620934844017029, + "rewards/rejected": -2.6871426105499268, + "step": 623 + }, + { + "epoch": 0.40754347293656623, + "grad_norm": 7.985703028509244, + "learning_rate": 1.1085831412623295e-07, + "logits/chosen": -1.5507965087890625, + "logits/rejected": -1.5302854776382446, + "logps/chosen": -759.2698364257812, + "logps/rejected": -793.293701171875, + "loss": 0.5103, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.41506290435791, + "rewards/margins": 0.651451826095581, + "rewards/rejected": -3.0665149688720703, + "step": 624 + }, + { + "epoch": 0.4081965874765287, + "grad_norm": 96.15150960782869, + "learning_rate": 1.107079344812325e-07, + "logits/chosen": -1.5727829933166504, + "logits/rejected": -1.583585500717163, + "logps/chosen": -742.6324462890625, + "logps/rejected": -827.9069213867188, + "loss": 0.5681, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.324497938156128, + "rewards/margins": 0.5150927901268005, + "rewards/rejected": -2.839590549468994, + "step": 625 + }, + { + "epoch": 0.4088497020164911, + "grad_norm": 44.171131036436144, + "learning_rate": 1.1055736897175004e-07, + "logits/chosen": -1.5920262336730957, + "logits/rejected": -1.5330153703689575, + "logps/chosen": -811.8800048828125, + "logps/rejected": -813.5941772460938, + "loss": 0.5467, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6739134788513184, + "rewards/margins": 0.49917757511138916, + "rewards/rejected": -3.173091173171997, + "step": 626 + }, + { + "epoch": 0.4095028165564536, + "grad_norm": 56.76814949785248, + "learning_rate": 1.1040661838149878e-07, + "logits/chosen": -1.5579776763916016, + "logits/rejected": -1.5597628355026245, + "logps/chosen": -807.7301025390625, + "logps/rejected": -894.228515625, + "loss": 0.4822, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6176559925079346, + "rewards/margins": 0.6038171648979187, + "rewards/rejected": -3.221472978591919, + "step": 627 + }, + { + "epoch": 0.410155931096416, + "grad_norm": 26.749818630716884, + "learning_rate": 1.1025568349515528e-07, + "logits/chosen": -1.4747806787490845, + "logits/rejected": -1.4980195760726929, + "logps/chosen": -683.0436401367188, + "logps/rejected": -686.6530151367188, + "loss": 0.5013, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.168447732925415, + "rewards/margins": 0.4241008460521698, + "rewards/rejected": -2.5925486087799072, + "step": 628 + }, + { + "epoch": 0.4108090456363785, + "grad_norm": 76.09203034611112, + "learning_rate": 1.1010456509835548e-07, + "logits/chosen": -1.6177140474319458, + "logits/rejected": -1.6009244918823242, + "logps/chosen": -720.8223876953125, + "logps/rejected": -741.0955810546875, + "loss": 0.5919, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.544870376586914, + "rewards/margins": 0.48653483390808105, + "rewards/rejected": -3.031405210494995, + "step": 629 + }, + { + "epoch": 0.4114621601763409, + "grad_norm": 52.43809285107618, + "learning_rate": 1.0995326397769042e-07, + "logits/chosen": -1.529783010482788, + "logits/rejected": -1.5328000783920288, + "logps/chosen": -664.4801025390625, + "logps/rejected": -768.4127807617188, + "loss": 0.4978, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.260840654373169, + "rewards/margins": 0.6211092472076416, + "rewards/rejected": -2.8819501399993896, + "step": 630 + }, + { + "epoch": 0.4121152747163034, + "grad_norm": 9.495957129740592, + "learning_rate": 1.0980178092070225e-07, + "logits/chosen": -1.578983187675476, + "logits/rejected": -1.5786319971084595, + "logps/chosen": -745.9150390625, + "logps/rejected": -808.736083984375, + "loss": 0.5322, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5862929821014404, + "rewards/margins": 0.4649811387062073, + "rewards/rejected": -3.051273822784424, + "step": 631 + }, + { + "epoch": 0.4127683892562658, + "grad_norm": 40.60424346315586, + "learning_rate": 1.0965011671588021e-07, + "logits/chosen": -1.5746647119522095, + "logits/rejected": -1.554513931274414, + "logps/chosen": -797.9632568359375, + "logps/rejected": -785.4474487304688, + "loss": 0.601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.463163375854492, + "rewards/margins": 0.24002833664417267, + "rewards/rejected": -2.7031917572021484, + "step": 632 + }, + { + "epoch": 0.4134215037962283, + "grad_norm": 52.831172156435784, + "learning_rate": 1.094982721526563e-07, + "logits/chosen": -1.540035367012024, + "logits/rejected": -1.5365304946899414, + "logps/chosen": -826.9459228515625, + "logps/rejected": -893.4970703125, + "loss": 0.4984, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0703601837158203, + "rewards/margins": 0.7675031423568726, + "rewards/rejected": -3.8378632068634033, + "step": 633 + }, + { + "epoch": 0.4140746183361907, + "grad_norm": 15.726901054602733, + "learning_rate": 1.0934624802140147e-07, + "logits/chosen": -1.5329731702804565, + "logits/rejected": -1.5518198013305664, + "logps/chosen": -763.7630615234375, + "logps/rejected": -806.2711791992188, + "loss": 0.5294, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.819103479385376, + "rewards/margins": 0.638033926486969, + "rewards/rejected": -3.457137107849121, + "step": 634 + }, + { + "epoch": 0.41472773287615317, + "grad_norm": 18.674657678831167, + "learning_rate": 1.0919404511342121e-07, + "logits/chosen": -1.5634597539901733, + "logits/rejected": -1.5584088563919067, + "logps/chosen": -730.9223022460938, + "logps/rejected": -818.416259765625, + "loss": 0.5022, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.1986281871795654, + "rewards/margins": 0.8553330898284912, + "rewards/rejected": -3.0539615154266357, + "step": 635 + }, + { + "epoch": 0.4153808474161156, + "grad_norm": 19.5510889690524, + "learning_rate": 1.0904166422095162e-07, + "logits/chosen": -1.5800487995147705, + "logits/rejected": -1.5099263191223145, + "logps/chosen": -743.9994506835938, + "logps/rejected": -761.8692016601562, + "loss": 0.5499, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8127152919769287, + "rewards/margins": 0.35885605216026306, + "rewards/rejected": -3.1715714931488037, + "step": 636 + }, + { + "epoch": 0.41603396195607806, + "grad_norm": 29.4349268646558, + "learning_rate": 1.0888910613715523e-07, + "logits/chosen": -1.5855112075805664, + "logits/rejected": -1.5745254755020142, + "logps/chosen": -724.60888671875, + "logps/rejected": -817.6202392578125, + "loss": 0.5035, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4678425788879395, + "rewards/margins": 0.9483436346054077, + "rewards/rejected": -3.416186571121216, + "step": 637 + }, + { + "epoch": 0.4166870764960405, + "grad_norm": 27.606512517928415, + "learning_rate": 1.0873637165611688e-07, + "logits/chosen": -1.5267822742462158, + "logits/rejected": -1.5166692733764648, + "logps/chosen": -756.2583618164062, + "logps/rejected": -905.69287109375, + "loss": 0.5459, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.325356960296631, + "rewards/margins": 0.8004791736602783, + "rewards/rejected": -3.125836133956909, + "step": 638 + }, + { + "epoch": 0.41734019103600295, + "grad_norm": 10.112999624727967, + "learning_rate": 1.085834615728396e-07, + "logits/chosen": -1.5627224445343018, + "logits/rejected": -1.589940071105957, + "logps/chosen": -757.5079345703125, + "logps/rejected": -797.4879150390625, + "loss": 0.5334, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7449193000793457, + "rewards/margins": 0.47740480303764343, + "rewards/rejected": -3.2223243713378906, + "step": 639 + }, + { + "epoch": 0.41799330557596537, + "grad_norm": 49.554186475304675, + "learning_rate": 1.0843037668324037e-07, + "logits/chosen": -1.6192628145217896, + "logits/rejected": -1.6073408126831055, + "logps/chosen": -692.6961669921875, + "logps/rejected": -731.430419921875, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.307335376739502, + "rewards/margins": 0.42022186517715454, + "rewards/rejected": -2.7275571823120117, + "step": 640 + }, + { + "epoch": 0.41864642011592784, + "grad_norm": 12.12872006128116, + "learning_rate": 1.0827711778414616e-07, + "logits/chosen": -1.5925276279449463, + "logits/rejected": -1.617337703704834, + "logps/chosen": -773.3049926757812, + "logps/rejected": -886.1497192382812, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7378339767456055, + "rewards/margins": 0.9473193287849426, + "rewards/rejected": -3.6851532459259033, + "step": 641 + }, + { + "epoch": 0.41929953465589026, + "grad_norm": 9.420684381655802, + "learning_rate": 1.0812368567328965e-07, + "logits/chosen": -1.5554149150848389, + "logits/rejected": -1.5351797342300415, + "logps/chosen": -741.2017211914062, + "logps/rejected": -848.9796752929688, + "loss": 0.5536, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7187960147857666, + "rewards/margins": 0.7441635727882385, + "rewards/rejected": -3.4629597663879395, + "step": 642 + }, + { + "epoch": 0.41995264919585273, + "grad_norm": 17.59371870537502, + "learning_rate": 1.0797008114930504e-07, + "logits/chosen": -1.5738487243652344, + "logits/rejected": -1.5535151958465576, + "logps/chosen": -871.8766479492188, + "logps/rejected": -991.9896240234375, + "loss": 0.52, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.753142833709717, + "rewards/margins": 0.8435327410697937, + "rewards/rejected": -3.5966756343841553, + "step": 643 + }, + { + "epoch": 0.42060576373581515, + "grad_norm": 65.67234375195817, + "learning_rate": 1.078163050117241e-07, + "logits/chosen": -1.5654664039611816, + "logits/rejected": -1.5345994234085083, + "logps/chosen": -724.1328735351562, + "logps/rejected": -764.9933471679688, + "loss": 0.5788, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.66972279548645, + "rewards/margins": 0.42094868421554565, + "rewards/rejected": -3.0906717777252197, + "step": 644 + }, + { + "epoch": 0.4212588782757776, + "grad_norm": 29.19611600663847, + "learning_rate": 1.0766235806097172e-07, + "logits/chosen": -1.6037955284118652, + "logits/rejected": -1.552761197090149, + "logps/chosen": -850.3397216796875, + "logps/rejected": -891.075927734375, + "loss": 0.498, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.936112880706787, + "rewards/margins": 0.9705631136894226, + "rewards/rejected": -3.9066760540008545, + "step": 645 + }, + { + "epoch": 0.42191199281574004, + "grad_norm": 55.001913874158426, + "learning_rate": 1.0750824109836202e-07, + "logits/chosen": -1.63307785987854, + "logits/rejected": -1.5964423418045044, + "logps/chosen": -867.8717041015625, + "logps/rejected": -924.471435546875, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.914452314376831, + "rewards/margins": 0.48612168431282043, + "rewards/rejected": -3.40057373046875, + "step": 646 + }, + { + "epoch": 0.4225651073557025, + "grad_norm": 28.42526755883717, + "learning_rate": 1.0735395492609401e-07, + "logits/chosen": -1.477207899093628, + "logits/rejected": -1.4840601682662964, + "logps/chosen": -654.694091796875, + "logps/rejected": -722.9656982421875, + "loss": 0.5245, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.427155017852783, + "rewards/margins": 0.7215266227722168, + "rewards/rejected": -3.148681879043579, + "step": 647 + }, + { + "epoch": 0.42321822189566494, + "grad_norm": 20.153695342892174, + "learning_rate": 1.0719950034724741e-07, + "logits/chosen": -1.6053216457366943, + "logits/rejected": -1.58915114402771, + "logps/chosen": -833.958251953125, + "logps/rejected": -869.42236328125, + "loss": 0.5383, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8524227142333984, + "rewards/margins": 0.6670438051223755, + "rewards/rejected": -3.5194666385650635, + "step": 648 + }, + { + "epoch": 0.4238713364356274, + "grad_norm": 40.45524219360296, + "learning_rate": 1.0704487816577857e-07, + "logits/chosen": -1.5915526151657104, + "logits/rejected": -1.5662232637405396, + "logps/chosen": -755.484619140625, + "logps/rejected": -798.5493774414062, + "loss": 0.5185, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.445694923400879, + "rewards/margins": 0.6455334424972534, + "rewards/rejected": -3.0912282466888428, + "step": 649 + }, + { + "epoch": 0.42452445097558983, + "grad_norm": 45.48878539730004, + "learning_rate": 1.0689008918651624e-07, + "logits/chosen": -1.4695310592651367, + "logits/rejected": -1.4832361936569214, + "logps/chosen": -884.2122802734375, + "logps/rejected": -879.8310546875, + "loss": 0.5647, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.048762559890747, + "rewards/margins": 0.2833816707134247, + "rewards/rejected": -3.332144260406494, + "step": 650 + }, + { + "epoch": 0.4251775655155523, + "grad_norm": 56.42516159244524, + "learning_rate": 1.0673513421515733e-07, + "logits/chosen": -1.6036771535873413, + "logits/rejected": -1.617321252822876, + "logps/chosen": -794.57421875, + "logps/rejected": -828.6959838867188, + "loss": 0.5217, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.69077467918396, + "rewards/margins": 0.46351078152656555, + "rewards/rejected": -3.154285430908203, + "step": 651 + }, + { + "epoch": 0.4258306800555147, + "grad_norm": 46.62254829895216, + "learning_rate": 1.0658001405826283e-07, + "logits/chosen": -1.5648261308670044, + "logits/rejected": -1.5797994136810303, + "logps/chosen": -772.7039184570312, + "logps/rejected": -819.6089477539062, + "loss": 0.5396, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7385077476501465, + "rewards/margins": 0.5473735332489014, + "rewards/rejected": -3.285881519317627, + "step": 652 + }, + { + "epoch": 0.4264837945954772, + "grad_norm": 59.34200258381232, + "learning_rate": 1.0642472952325346e-07, + "logits/chosen": -1.5929808616638184, + "logits/rejected": -1.6374764442443848, + "logps/chosen": -735.03662109375, + "logps/rejected": -850.94189453125, + "loss": 0.4824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.357342004776001, + "rewards/margins": 1.2214264869689941, + "rewards/rejected": -3.578768253326416, + "step": 653 + }, + { + "epoch": 0.4271369091354396, + "grad_norm": 37.46472982323093, + "learning_rate": 1.062692814184056e-07, + "logits/chosen": -1.5046641826629639, + "logits/rejected": -1.4834293127059937, + "logps/chosen": -670.6259765625, + "logps/rejected": -814.8147583007812, + "loss": 0.5009, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.507924795150757, + "rewards/margins": 0.8526814579963684, + "rewards/rejected": -3.3606061935424805, + "step": 654 + }, + { + "epoch": 0.4277900236754021, + "grad_norm": 65.41811386558527, + "learning_rate": 1.0611367055284704e-07, + "logits/chosen": -1.5957469940185547, + "logits/rejected": -1.6134015321731567, + "logps/chosen": -910.1995239257812, + "logps/rejected": -958.0068969726562, + "loss": 0.5173, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.841102123260498, + "rewards/margins": 0.8195388913154602, + "rewards/rejected": -3.6606409549713135, + "step": 655 + }, + { + "epoch": 0.4284431382153645, + "grad_norm": 15.408196632262841, + "learning_rate": 1.0595789773655273e-07, + "logits/chosen": -1.5747413635253906, + "logits/rejected": -1.565165400505066, + "logps/chosen": -880.7881469726562, + "logps/rejected": -917.05419921875, + "loss": 0.5269, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.843327760696411, + "rewards/margins": 0.5438075661659241, + "rewards/rejected": -3.3871350288391113, + "step": 656 + }, + { + "epoch": 0.429096252755327, + "grad_norm": 12.34615445550627, + "learning_rate": 1.0580196378034061e-07, + "logits/chosen": -1.5093989372253418, + "logits/rejected": -1.528839111328125, + "logps/chosen": -804.33837890625, + "logps/rejected": -832.3118896484375, + "loss": 0.5247, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.883572816848755, + "rewards/margins": 0.4760682284832001, + "rewards/rejected": -3.3596410751342773, + "step": 657 + }, + { + "epoch": 0.4297493672952894, + "grad_norm": 20.79744618252237, + "learning_rate": 1.0564586949586736e-07, + "logits/chosen": -1.5655585527420044, + "logits/rejected": -1.5473308563232422, + "logps/chosen": -784.39208984375, + "logps/rejected": -780.73974609375, + "loss": 0.5298, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7032248973846436, + "rewards/margins": 0.7658544182777405, + "rewards/rejected": -3.46907901763916, + "step": 658 + }, + { + "epoch": 0.43040248183525187, + "grad_norm": 9.541823346620628, + "learning_rate": 1.0548961569562423e-07, + "logits/chosen": -1.551105260848999, + "logits/rejected": -1.5648680925369263, + "logps/chosen": -728.1370239257812, + "logps/rejected": -863.0052490234375, + "loss": 0.5929, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4059126377105713, + "rewards/margins": 0.7981605529785156, + "rewards/rejected": -3.2040724754333496, + "step": 659 + }, + { + "epoch": 0.4310555963752143, + "grad_norm": 48.77651510525204, + "learning_rate": 1.0533320319293272e-07, + "logits/chosen": -1.521799087524414, + "logits/rejected": -1.5136668682098389, + "logps/chosen": -767.982421875, + "logps/rejected": -844.113037109375, + "loss": 0.5468, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.578664541244507, + "rewards/margins": 0.6596671342849731, + "rewards/rejected": -3.2383313179016113, + "step": 660 + }, + { + "epoch": 0.43170871091517676, + "grad_norm": 14.135228489391437, + "learning_rate": 1.0517663280194042e-07, + "logits/chosen": -1.541378140449524, + "logits/rejected": -1.5287246704101562, + "logps/chosen": -854.5809326171875, + "logps/rejected": -845.8003540039062, + "loss": 0.5743, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7386510372161865, + "rewards/margins": 0.5117663145065308, + "rewards/rejected": -3.2504169940948486, + "step": 661 + }, + { + "epoch": 0.4323618254551392, + "grad_norm": 39.96027400878076, + "learning_rate": 1.050199053376168e-07, + "logits/chosen": -1.5787113904953003, + "logits/rejected": -1.5196938514709473, + "logps/chosen": -895.7177734375, + "logps/rejected": -902.5272827148438, + "loss": 0.5603, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0833802223205566, + "rewards/margins": 0.44520139694213867, + "rewards/rejected": -3.5285820960998535, + "step": 662 + }, + { + "epoch": 0.43301493999510166, + "grad_norm": 9.336642307745894, + "learning_rate": 1.0486302161574876e-07, + "logits/chosen": -1.5741958618164062, + "logits/rejected": -1.566494345664978, + "logps/chosen": -747.9061889648438, + "logps/rejected": -897.3228759765625, + "loss": 0.5216, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3504905700683594, + "rewards/margins": 1.2214593887329102, + "rewards/rejected": -3.5719499588012695, + "step": 663 + }, + { + "epoch": 0.4336680545350641, + "grad_norm": 34.74703628355522, + "learning_rate": 1.0470598245293676e-07, + "logits/chosen": -1.5200121402740479, + "logits/rejected": -1.537473201751709, + "logps/chosen": -792.9605102539062, + "logps/rejected": -799.4251708984375, + "loss": 0.5492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7280704975128174, + "rewards/margins": 0.4831348657608032, + "rewards/rejected": -3.211205244064331, + "step": 664 + }, + { + "epoch": 0.43432116907502655, + "grad_norm": 26.166178900391024, + "learning_rate": 1.0454878866659017e-07, + "logits/chosen": -1.591930627822876, + "logits/rejected": -1.5695099830627441, + "logps/chosen": -727.5706787109375, + "logps/rejected": -792.0450439453125, + "loss": 0.5506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6660943031311035, + "rewards/margins": 0.6367244124412537, + "rewards/rejected": -3.302818775177002, + "step": 665 + }, + { + "epoch": 0.43497428361498897, + "grad_norm": 10.372172666812576, + "learning_rate": 1.0439144107492328e-07, + "logits/chosen": -1.4827499389648438, + "logits/rejected": -1.4672393798828125, + "logps/chosen": -800.9483032226562, + "logps/rejected": -798.546875, + "loss": 0.5153, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8575448989868164, + "rewards/margins": 0.5324373841285706, + "rewards/rejected": -3.389982223510742, + "step": 666 + }, + { + "epoch": 0.43562739815495144, + "grad_norm": 11.965156702513355, + "learning_rate": 1.0423394049695095e-07, + "logits/chosen": -1.5964605808258057, + "logits/rejected": -1.5629247426986694, + "logps/chosen": -812.4568481445312, + "logps/rejected": -916.690185546875, + "loss": 0.5439, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.873971939086914, + "rewards/margins": 0.766517162322998, + "rewards/rejected": -3.640489101409912, + "step": 667 + }, + { + "epoch": 0.43628051269491386, + "grad_norm": 17.25391970334388, + "learning_rate": 1.0407628775248433e-07, + "logits/chosen": -1.5258575677871704, + "logits/rejected": -1.557664155960083, + "logps/chosen": -774.147216796875, + "logps/rejected": -935.6986083984375, + "loss": 0.4742, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4668354988098145, + "rewards/margins": 1.3980622291564941, + "rewards/rejected": -3.8648977279663086, + "step": 668 + }, + { + "epoch": 0.43693362723487633, + "grad_norm": 14.901011998748888, + "learning_rate": 1.0391848366212666e-07, + "logits/chosen": -1.594072937965393, + "logits/rejected": -1.587380051612854, + "logps/chosen": -780.095703125, + "logps/rejected": -784.4081420898438, + "loss": 0.5749, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.8471426963806152, + "rewards/margins": 0.19949409365653992, + "rewards/rejected": -3.0466368198394775, + "step": 669 + }, + { + "epoch": 0.43758674177483875, + "grad_norm": 46.17531372330182, + "learning_rate": 1.0376052904726888e-07, + "logits/chosen": -1.6172115802764893, + "logits/rejected": -1.5498206615447998, + "logps/chosen": -820.4274291992188, + "logps/rejected": -908.1333618164062, + "loss": 0.5152, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.615933656692505, + "rewards/margins": 0.7912464737892151, + "rewards/rejected": -3.407179832458496, + "step": 670 + }, + { + "epoch": 0.4382398563148012, + "grad_norm": 51.65597788618139, + "learning_rate": 1.0360242473008551e-07, + "logits/chosen": -1.5762639045715332, + "logits/rejected": -1.594434380531311, + "logps/chosen": -824.281982421875, + "logps/rejected": -829.8516845703125, + "loss": 0.5618, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.047189712524414, + "rewards/margins": 0.6297323703765869, + "rewards/rejected": -3.67692232131958, + "step": 671 + }, + { + "epoch": 0.43889297085476364, + "grad_norm": 33.441136796480215, + "learning_rate": 1.0344417153353023e-07, + "logits/chosen": -1.4966257810592651, + "logits/rejected": -1.459989309310913, + "logps/chosen": -817.207763671875, + "logps/rejected": -897.0996704101562, + "loss": 0.5394, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9682440757751465, + "rewards/margins": 0.6141051054000854, + "rewards/rejected": -3.5823495388031006, + "step": 672 + }, + { + "epoch": 0.4395460853947261, + "grad_norm": 31.378606252739424, + "learning_rate": 1.0328577028133171e-07, + "logits/chosen": -1.5966362953186035, + "logits/rejected": -1.6483103036880493, + "logps/chosen": -732.49609375, + "logps/rejected": -800.7023315429688, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.764153003692627, + "rewards/margins": 0.5777581930160522, + "rewards/rejected": -3.341911554336548, + "step": 673 + }, + { + "epoch": 0.44019919993468853, + "grad_norm": 34.56431138782349, + "learning_rate": 1.0312722179798924e-07, + "logits/chosen": -1.598524808883667, + "logits/rejected": -1.6316325664520264, + "logps/chosen": -787.64892578125, + "logps/rejected": -861.8873291015625, + "loss": 0.5273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7730770111083984, + "rewards/margins": 0.7392612099647522, + "rewards/rejected": -3.5123379230499268, + "step": 674 + }, + { + "epoch": 0.440852314474651, + "grad_norm": 15.372795733191637, + "learning_rate": 1.0296852690876846e-07, + "logits/chosen": -1.5273460149765015, + "logits/rejected": -1.5593777894973755, + "logps/chosen": -790.3528442382812, + "logps/rejected": -815.5953979492188, + "loss": 0.5533, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.494476079940796, + "rewards/margins": 0.4577151834964752, + "rewards/rejected": -2.95219087600708, + "step": 675 + }, + { + "epoch": 0.4415054290146134, + "grad_norm": 36.80495254523571, + "learning_rate": 1.0280968643969706e-07, + "logits/chosen": -1.5882431268692017, + "logits/rejected": -1.5868853330612183, + "logps/chosen": -863.531005859375, + "logps/rejected": -894.946044921875, + "loss": 0.5153, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.805412769317627, + "rewards/margins": 0.8240199685096741, + "rewards/rejected": -3.629432201385498, + "step": 676 + }, + { + "epoch": 0.4421585435545759, + "grad_norm": 9.850541147286798, + "learning_rate": 1.0265070121756054e-07, + "logits/chosen": -1.5755228996276855, + "logits/rejected": -1.5751676559448242, + "logps/chosen": -729.009521484375, + "logps/rejected": -794.595947265625, + "loss": 0.5564, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3728232383728027, + "rewards/margins": 0.473653107881546, + "rewards/rejected": -2.8464765548706055, + "step": 677 + }, + { + "epoch": 0.4428116580945383, + "grad_norm": 8.929995911050677, + "learning_rate": 1.0249157206989785e-07, + "logits/chosen": -1.5073692798614502, + "logits/rejected": -1.542021632194519, + "logps/chosen": -802.9788818359375, + "logps/rejected": -878.1073608398438, + "loss": 0.4953, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.467076301574707, + "rewards/margins": 0.7465323209762573, + "rewards/rejected": -3.213608503341675, + "step": 678 + }, + { + "epoch": 0.4434647726345008, + "grad_norm": 17.43336381715629, + "learning_rate": 1.0233229982499702e-07, + "logits/chosen": -1.5233006477355957, + "logits/rejected": -1.575763463973999, + "logps/chosen": -728.665771484375, + "logps/rejected": -826.36279296875, + "loss": 0.5105, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.66928768157959, + "rewards/margins": 0.6221798062324524, + "rewards/rejected": -3.2914676666259766, + "step": 679 + }, + { + "epoch": 0.4441178871744632, + "grad_norm": 8.33574898866346, + "learning_rate": 1.0217288531189101e-07, + "logits/chosen": -1.4891436100006104, + "logits/rejected": -1.487116813659668, + "logps/chosen": -790.6260375976562, + "logps/rejected": -806.4535522460938, + "loss": 0.5277, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7111411094665527, + "rewards/margins": 0.511049747467041, + "rewards/rejected": -3.2221908569335938, + "step": 680 + }, + { + "epoch": 0.4447710017144257, + "grad_norm": 93.25103994990552, + "learning_rate": 1.0201332936035328e-07, + "logits/chosen": -1.5428980588912964, + "logits/rejected": -1.5394231081008911, + "logps/chosen": -750.823486328125, + "logps/rejected": -812.9085083007812, + "loss": 0.6195, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5228466987609863, + "rewards/margins": 0.6121708750724792, + "rewards/rejected": -3.1350173950195312, + "step": 681 + }, + { + "epoch": 0.4454241162543881, + "grad_norm": 24.723412198537826, + "learning_rate": 1.0185363280089346e-07, + "logits/chosen": -1.5300703048706055, + "logits/rejected": -1.5563037395477295, + "logps/chosen": -808.7451782226562, + "logps/rejected": -814.0613403320312, + "loss": 0.5547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.553114891052246, + "rewards/margins": 0.2794783413410187, + "rewards/rejected": -2.8325929641723633, + "step": 682 + }, + { + "epoch": 0.4460772307943506, + "grad_norm": 18.06928652509424, + "learning_rate": 1.0169379646475307e-07, + "logits/chosen": -1.5730717182159424, + "logits/rejected": -1.553600549697876, + "logps/chosen": -827.7550048828125, + "logps/rejected": -824.0142211914062, + "loss": 0.5089, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.8725967407226562, + "rewards/margins": 0.5028645992279053, + "rewards/rejected": -3.3754611015319824, + "step": 683 + }, + { + "epoch": 0.446730345334313, + "grad_norm": 13.432734617314106, + "learning_rate": 1.0153382118390124e-07, + "logits/chosen": -1.517228364944458, + "logits/rejected": -1.5097568035125732, + "logps/chosen": -828.512451171875, + "logps/rejected": -960.2579345703125, + "loss": 0.5539, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.074080467224121, + "rewards/margins": 0.7247848510742188, + "rewards/rejected": -3.7988648414611816, + "step": 684 + }, + { + "epoch": 0.44738345987427547, + "grad_norm": 111.08651677833566, + "learning_rate": 1.0137370779103024e-07, + "logits/chosen": -1.6381279230117798, + "logits/rejected": -1.642504334449768, + "logps/chosen": -895.6856079101562, + "logps/rejected": -954.673828125, + "loss": 0.4931, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.942159652709961, + "rewards/margins": 0.6529674530029297, + "rewards/rejected": -3.5951271057128906, + "step": 685 + }, + { + "epoch": 0.4480365744142379, + "grad_norm": 13.010841457025625, + "learning_rate": 1.0121345711955134e-07, + "logits/chosen": -1.4719197750091553, + "logits/rejected": -1.4533613920211792, + "logps/chosen": -753.68408203125, + "logps/rejected": -819.1432495117188, + "loss": 0.5515, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3883912563323975, + "rewards/margins": 0.776418149471283, + "rewards/rejected": -3.1648097038269043, + "step": 686 + }, + { + "epoch": 0.44868968895420036, + "grad_norm": 19.59305991487636, + "learning_rate": 1.0105307000359027e-07, + "logits/chosen": -1.5581806898117065, + "logits/rejected": -1.563573956489563, + "logps/chosen": -782.7869873046875, + "logps/rejected": -825.7442626953125, + "loss": 0.5426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6787917613983154, + "rewards/margins": 0.797275722026825, + "rewards/rejected": -3.4760677814483643, + "step": 687 + }, + { + "epoch": 0.4493428034941628, + "grad_norm": 41.30661441239489, + "learning_rate": 1.0089254727798299e-07, + "logits/chosen": -1.5456461906433105, + "logits/rejected": -1.5804781913757324, + "logps/chosen": -802.162841796875, + "logps/rejected": -830.0077514648438, + "loss": 0.5099, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7756364345550537, + "rewards/margins": 0.779686450958252, + "rewards/rejected": -3.5553228855133057, + "step": 688 + }, + { + "epoch": 0.44999591803412525, + "grad_norm": 43.257182510610015, + "learning_rate": 1.0073188977827134e-07, + "logits/chosen": -1.530835509300232, + "logits/rejected": -1.5210024118423462, + "logps/chosen": -774.5211791992188, + "logps/rejected": -830.6015625, + "loss": 0.4923, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.939332962036133, + "rewards/margins": 0.8855284452438354, + "rewards/rejected": -3.824861526489258, + "step": 689 + }, + { + "epoch": 0.45064903257408767, + "grad_norm": 28.06793800668904, + "learning_rate": 1.005710983406987e-07, + "logits/chosen": -1.5865156650543213, + "logits/rejected": -1.536832571029663, + "logps/chosen": -714.5545654296875, + "logps/rejected": -739.9314575195312, + "loss": 0.5408, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3614490032196045, + "rewards/margins": 0.4486488997936249, + "rewards/rejected": -2.8100976943969727, + "step": 690 + }, + { + "epoch": 0.45130214711405015, + "grad_norm": 13.710061169800268, + "learning_rate": 1.0041017380220558e-07, + "logits/chosen": -1.6234124898910522, + "logits/rejected": -1.6429170370101929, + "logps/chosen": -942.2227783203125, + "logps/rejected": -983.3606567382812, + "loss": 0.5934, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9503729343414307, + "rewards/margins": 0.5114213824272156, + "rewards/rejected": -3.461794376373291, + "step": 691 + }, + { + "epoch": 0.45195526165401256, + "grad_norm": 42.74504837687908, + "learning_rate": 1.002491170004253e-07, + "logits/chosen": -1.5679419040679932, + "logits/rejected": -1.5443594455718994, + "logps/chosen": -842.61474609375, + "logps/rejected": -882.8880004882812, + "loss": 0.5011, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7981529235839844, + "rewards/margins": 0.8635019659996033, + "rewards/rejected": -3.6616549491882324, + "step": 692 + }, + { + "epoch": 0.45260837619397504, + "grad_norm": 51.01361393063693, + "learning_rate": 1.0008792877367964e-07, + "logits/chosen": -1.6048243045806885, + "logits/rejected": -1.5834286212921143, + "logps/chosen": -800.6567993164062, + "logps/rejected": -932.0943603515625, + "loss": 0.6234, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7766685485839844, + "rewards/margins": 0.7954378128051758, + "rewards/rejected": -3.57210636138916, + "step": 693 + }, + { + "epoch": 0.45326149073393746, + "grad_norm": 26.952246598872957, + "learning_rate": 9.992660996097447e-08, + "logits/chosen": -1.487755537033081, + "logits/rejected": -1.475599765777588, + "logps/chosen": -764.441162109375, + "logps/rejected": -858.9654541015625, + "loss": 0.471, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6644558906555176, + "rewards/margins": 0.7122585773468018, + "rewards/rejected": -3.3767144680023193, + "step": 694 + }, + { + "epoch": 0.45391460527389993, + "grad_norm": 11.341690238802435, + "learning_rate": 9.976516140199535e-08, + "logits/chosen": -1.654636025428772, + "logits/rejected": -1.6279263496398926, + "logps/chosen": -713.5647583007812, + "logps/rejected": -748.8384399414062, + "loss": 0.5867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.517662525177002, + "rewards/margins": 0.454134076833725, + "rewards/rejected": -2.971796989440918, + "step": 695 + }, + { + "epoch": 0.45456771981386235, + "grad_norm": 40.33369925086035, + "learning_rate": 9.960358393710321e-08, + "logits/chosen": -1.528939962387085, + "logits/rejected": -1.5497705936431885, + "logps/chosen": -764.209716796875, + "logps/rejected": -920.8458862304688, + "loss": 0.4745, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5999767780303955, + "rewards/margins": 1.0393584966659546, + "rewards/rejected": -3.6393353939056396, + "step": 696 + }, + { + "epoch": 0.4552208343538248, + "grad_norm": 12.320269188700282, + "learning_rate": 9.944187840732994e-08, + "logits/chosen": -1.5356950759887695, + "logits/rejected": -1.5276272296905518, + "logps/chosen": -824.4510498046875, + "logps/rejected": -828.6773681640625, + "loss": 0.4832, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7461459636688232, + "rewards/margins": 0.7607426047325134, + "rewards/rejected": -3.5068888664245605, + "step": 697 + }, + { + "epoch": 0.45587394889378724, + "grad_norm": 20.693131756565563, + "learning_rate": 9.928004565437409e-08, + "logits/chosen": -1.5997885465621948, + "logits/rejected": -1.596414566040039, + "logps/chosen": -749.1096801757812, + "logps/rejected": -785.3474731445312, + "loss": 0.5544, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.6378190517425537, + "rewards/margins": 0.5456668138504028, + "rewards/rejected": -3.183485746383667, + "step": 698 + }, + { + "epoch": 0.4565270634337497, + "grad_norm": 31.393879848402715, + "learning_rate": 9.911808652059627e-08, + "logits/chosen": -1.5327155590057373, + "logits/rejected": -1.5094130039215088, + "logps/chosen": -713.3516235351562, + "logps/rejected": -709.5831298828125, + "loss": 0.565, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3836562633514404, + "rewards/margins": 0.40900933742523193, + "rewards/rejected": -2.792665719985962, + "step": 699 + }, + { + "epoch": 0.45718017797371213, + "grad_norm": 44.56122798633119, + "learning_rate": 9.895600184901504e-08, + "logits/chosen": -1.6142834424972534, + "logits/rejected": -1.5989428758621216, + "logps/chosen": -814.4269409179688, + "logps/rejected": -822.01513671875, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7036519050598145, + "rewards/margins": 0.4850347638130188, + "rewards/rejected": -3.1886863708496094, + "step": 700 + }, + { + "epoch": 0.45718017797371213, + "eval_logits/chosen": -1.5524276494979858, + "eval_logits/rejected": -1.5454105138778687, + "eval_logps/chosen": -772.6592407226562, + "eval_logps/rejected": -832.8743896484375, + "eval_loss": 0.5305144190788269, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -2.623138427734375, + "eval_rewards/margins": 0.6832101345062256, + "eval_rewards/rejected": -3.3063488006591797, + "eval_runtime": 296.7195, + "eval_samples_per_second": 13.481, + "eval_steps_per_second": 0.843, + "step": 700 + }, + { + "epoch": 0.4578332925136746, + "grad_norm": 8.9780374884061, + "learning_rate": 9.879379248330239e-08, + "logits/chosen": -1.5523990392684937, + "logits/rejected": -1.5356321334838867, + "logps/chosen": -745.2019653320312, + "logps/rejected": -832.36376953125, + "loss": 0.5029, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4654135704040527, + "rewards/margins": 0.739882230758667, + "rewards/rejected": -3.2052958011627197, + "step": 701 + }, + { + "epoch": 0.458486407053637, + "grad_norm": 13.764196067118446, + "learning_rate": 9.863145926777934e-08, + "logits/chosen": -1.5704573392868042, + "logits/rejected": -1.5471737384796143, + "logps/chosen": -749.1688232421875, + "logps/rejected": -803.7322998046875, + "loss": 0.5578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7841954231262207, + "rewards/margins": 0.7516999840736389, + "rewards/rejected": -3.535895347595215, + "step": 702 + }, + { + "epoch": 0.4591395215935995, + "grad_norm": 28.637922274224316, + "learning_rate": 9.846900304741157e-08, + "logits/chosen": -1.5777729749679565, + "logits/rejected": -1.5776381492614746, + "logps/chosen": -775.2662353515625, + "logps/rejected": -831.3828735351562, + "loss": 0.5208, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6320080757141113, + "rewards/margins": 0.6068910360336304, + "rewards/rejected": -3.238898992538452, + "step": 703 + }, + { + "epoch": 0.4597926361335619, + "grad_norm": 19.220774272497522, + "learning_rate": 9.830642466780502e-08, + "logits/chosen": -1.6617802381515503, + "logits/rejected": -1.6172996759414673, + "logps/chosen": -792.265380859375, + "logps/rejected": -793.1304931640625, + "loss": 0.5726, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6746013164520264, + "rewards/margins": 0.42366841435432434, + "rewards/rejected": -3.0982699394226074, + "step": 704 + }, + { + "epoch": 0.4604457506735244, + "grad_norm": 9.690502883669247, + "learning_rate": 9.814372497520143e-08, + "logits/chosen": -1.59645414352417, + "logits/rejected": -1.5628377199172974, + "logps/chosen": -759.610595703125, + "logps/rejected": -795.2174072265625, + "loss": 0.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5422470569610596, + "rewards/margins": 0.4729628562927246, + "rewards/rejected": -3.015209674835205, + "step": 705 + }, + { + "epoch": 0.4610988652134868, + "grad_norm": 29.25714731199898, + "learning_rate": 9.798090481647411e-08, + "logits/chosen": -1.5286840200424194, + "logits/rejected": -1.5571849346160889, + "logps/chosen": -745.8056640625, + "logps/rejected": -824.3461303710938, + "loss": 0.5433, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.467949867248535, + "rewards/margins": 0.9363622665405273, + "rewards/rejected": -3.4043126106262207, + "step": 706 + }, + { + "epoch": 0.4617519797534493, + "grad_norm": 11.95355469721655, + "learning_rate": 9.781796503912328e-08, + "logits/chosen": -1.5327837467193604, + "logits/rejected": -1.5477138757705688, + "logps/chosen": -788.408447265625, + "logps/rejected": -918.1892700195312, + "loss": 0.5018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.641738176345825, + "rewards/margins": 0.9118731617927551, + "rewards/rejected": -3.5536112785339355, + "step": 707 + }, + { + "epoch": 0.4624050942934117, + "grad_norm": 9.057095892979188, + "learning_rate": 9.765490649127187e-08, + "logits/chosen": -1.58297860622406, + "logits/rejected": -1.5312621593475342, + "logps/chosen": -640.93798828125, + "logps/rejected": -716.5929565429688, + "loss": 0.5052, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1810810565948486, + "rewards/margins": 0.6035025119781494, + "rewards/rejected": -2.784583568572998, + "step": 708 + }, + { + "epoch": 0.4630582088333742, + "grad_norm": 17.256671471200423, + "learning_rate": 9.749173002166101e-08, + "logits/chosen": -1.4686298370361328, + "logits/rejected": -1.5486806631088257, + "logps/chosen": -787.2227172851562, + "logps/rejected": -868.33056640625, + "loss": 0.5082, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.844477415084839, + "rewards/margins": 0.6099371314048767, + "rewards/rejected": -3.4544146060943604, + "step": 709 + }, + { + "epoch": 0.4637113233733366, + "grad_norm": 11.493776981683995, + "learning_rate": 9.732843647964563e-08, + "logits/chosen": -1.3998236656188965, + "logits/rejected": -1.3739439249038696, + "logps/chosen": -645.9183959960938, + "logps/rejected": -692.6560668945312, + "loss": 0.5008, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.068920135498047, + "rewards/margins": 0.3378119170665741, + "rewards/rejected": -2.4067318439483643, + "step": 710 + }, + { + "epoch": 0.46436443791329907, + "grad_norm": 19.407875344745552, + "learning_rate": 9.716502671519003e-08, + "logits/chosen": -1.594405174255371, + "logits/rejected": -1.5376571416854858, + "logps/chosen": -743.6107177734375, + "logps/rejected": -760.2347412109375, + "loss": 0.5606, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.476472854614258, + "rewards/margins": 0.6245778799057007, + "rewards/rejected": -3.101050853729248, + "step": 711 + }, + { + "epoch": 0.4650175524532615, + "grad_norm": 30.65022124829921, + "learning_rate": 9.700150157886345e-08, + "logits/chosen": -1.4861985445022583, + "logits/rejected": -1.4618767499923706, + "logps/chosen": -715.8246459960938, + "logps/rejected": -819.9178466796875, + "loss": 0.4772, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.780965805053711, + "rewards/margins": 0.6235308647155762, + "rewards/rejected": -3.404496669769287, + "step": 712 + }, + { + "epoch": 0.46567066699322396, + "grad_norm": 66.74879517812167, + "learning_rate": 9.683786192183569e-08, + "logits/chosen": -1.5658459663391113, + "logits/rejected": -1.5532565116882324, + "logps/chosen": -791.1273803710938, + "logps/rejected": -858.0960083007812, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.74170184135437, + "rewards/margins": 0.8308650255203247, + "rewards/rejected": -3.5725669860839844, + "step": 713 + }, + { + "epoch": 0.4663237815331864, + "grad_norm": 58.623748201586416, + "learning_rate": 9.667410859587261e-08, + "logits/chosen": -1.5724753141403198, + "logits/rejected": -1.581168532371521, + "logps/chosen": -814.78125, + "logps/rejected": -851.2848510742188, + "loss": 0.5508, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.974850654602051, + "rewards/margins": 0.2711948752403259, + "rewards/rejected": -3.2460455894470215, + "step": 714 + }, + { + "epoch": 0.46697689607314885, + "grad_norm": 12.956291639260309, + "learning_rate": 9.651024245333177e-08, + "logits/chosen": -1.5934419631958008, + "logits/rejected": -1.581264615058899, + "logps/chosen": -838.2115478515625, + "logps/rejected": -835.7385864257812, + "loss": 0.5465, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9340994358062744, + "rewards/margins": 0.6708505749702454, + "rewards/rejected": -3.604949951171875, + "step": 715 + }, + { + "epoch": 0.46763001061311127, + "grad_norm": 11.679151198008364, + "learning_rate": 9.634626434715791e-08, + "logits/chosen": -1.5744848251342773, + "logits/rejected": -1.5612497329711914, + "logps/chosen": -834.1600341796875, + "logps/rejected": -887.748779296875, + "loss": 0.5068, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9672343730926514, + "rewards/margins": 0.5058416128158569, + "rewards/rejected": -3.4730756282806396, + "step": 716 + }, + { + "epoch": 0.46828312515307374, + "grad_norm": 16.13898005368633, + "learning_rate": 9.618217513087857e-08, + "logits/chosen": -1.5531257390975952, + "logits/rejected": -1.5761160850524902, + "logps/chosen": -858.4584350585938, + "logps/rejected": -849.4945678710938, + "loss": 0.5375, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.655836820602417, + "rewards/margins": 0.46926283836364746, + "rewards/rejected": -3.1250996589660645, + "step": 717 + }, + { + "epoch": 0.46893623969303616, + "grad_norm": 29.758612473106314, + "learning_rate": 9.601797565859966e-08, + "logits/chosen": -1.5403785705566406, + "logits/rejected": -1.5410677194595337, + "logps/chosen": -761.43896484375, + "logps/rejected": -858.6434936523438, + "loss": 0.508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7271745204925537, + "rewards/margins": 0.8027867078781128, + "rewards/rejected": -3.529961585998535, + "step": 718 + }, + { + "epoch": 0.46958935423299863, + "grad_norm": 12.15073631227884, + "learning_rate": 9.585366678500099e-08, + "logits/chosen": -1.5667126178741455, + "logits/rejected": -1.5372388362884521, + "logps/chosen": -725.1177368164062, + "logps/rejected": -834.06494140625, + "loss": 0.528, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6709213256835938, + "rewards/margins": 0.7990758419036865, + "rewards/rejected": -3.469996929168701, + "step": 719 + }, + { + "epoch": 0.47024246877296105, + "grad_norm": 24.52142611093719, + "learning_rate": 9.568924936533176e-08, + "logits/chosen": -1.5142229795455933, + "logits/rejected": -1.5096334218978882, + "logps/chosen": -788.1256713867188, + "logps/rejected": -853.0916748046875, + "loss": 0.4793, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0224575996398926, + "rewards/margins": 0.8449983596801758, + "rewards/rejected": -3.8674559593200684, + "step": 720 + }, + { + "epoch": 0.4708955833129235, + "grad_norm": 12.298496353962463, + "learning_rate": 9.552472425540622e-08, + "logits/chosen": -1.4641120433807373, + "logits/rejected": -1.4405646324157715, + "logps/chosen": -814.8626708984375, + "logps/rejected": -852.576904296875, + "loss": 0.5757, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7843127250671387, + "rewards/margins": 0.5765048265457153, + "rewards/rejected": -3.3608171939849854, + "step": 721 + }, + { + "epoch": 0.47154869785288595, + "grad_norm": 9.717229033768323, + "learning_rate": 9.536009231159913e-08, + "logits/chosen": -1.5470542907714844, + "logits/rejected": -1.5365655422210693, + "logps/chosen": -879.9993896484375, + "logps/rejected": -939.4688720703125, + "loss": 0.4599, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9776620864868164, + "rewards/margins": 0.7803228497505188, + "rewards/rejected": -3.7579846382141113, + "step": 722 + }, + { + "epoch": 0.4722018123928484, + "grad_norm": 36.798678834078146, + "learning_rate": 9.519535439084134e-08, + "logits/chosen": -1.5727907419204712, + "logits/rejected": -1.5672595500946045, + "logps/chosen": -777.5218505859375, + "logps/rejected": -853.3557739257812, + "loss": 0.5406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.753126859664917, + "rewards/margins": 0.6771417260169983, + "rewards/rejected": -3.4302685260772705, + "step": 723 + }, + { + "epoch": 0.47285492693281084, + "grad_norm": 25.32155426640716, + "learning_rate": 9.503051135061538e-08, + "logits/chosen": -1.5253040790557861, + "logits/rejected": -1.5683470964431763, + "logps/chosen": -827.9707641601562, + "logps/rejected": -843.636962890625, + "loss": 0.4839, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8832221031188965, + "rewards/margins": 0.7474449276924133, + "rewards/rejected": -3.630666732788086, + "step": 724 + }, + { + "epoch": 0.4735080414727733, + "grad_norm": 29.38483596431046, + "learning_rate": 9.486556404895083e-08, + "logits/chosen": -1.4884456396102905, + "logits/rejected": -1.4345768690109253, + "logps/chosen": -738.1644897460938, + "logps/rejected": -887.9599609375, + "loss": 0.5609, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7885422706604004, + "rewards/margins": 1.1899745464324951, + "rewards/rejected": -3.9785170555114746, + "step": 725 + }, + { + "epoch": 0.47416115601273573, + "grad_norm": 10.62636396986744, + "learning_rate": 9.470051334442008e-08, + "logits/chosen": -1.477259874343872, + "logits/rejected": -1.5057060718536377, + "logps/chosen": -770.3033447265625, + "logps/rejected": -863.6229248046875, + "loss": 0.5272, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.945749044418335, + "rewards/margins": 0.7120295763015747, + "rewards/rejected": -3.657778739929199, + "step": 726 + }, + { + "epoch": 0.4748142705526982, + "grad_norm": 17.901405492456256, + "learning_rate": 9.453536009613367e-08, + "logits/chosen": -1.49770987033844, + "logits/rejected": -1.5056265592575073, + "logps/chosen": -741.2904052734375, + "logps/rejected": -932.2896118164062, + "loss": 0.4609, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7706236839294434, + "rewards/margins": 0.9994168281555176, + "rewards/rejected": -3.770040988922119, + "step": 727 + }, + { + "epoch": 0.4754673850926606, + "grad_norm": 12.867336823169316, + "learning_rate": 9.437010516373592e-08, + "logits/chosen": -1.4868743419647217, + "logits/rejected": -1.5124531984329224, + "logps/chosen": -870.4682006835938, + "logps/rejected": -907.4706420898438, + "loss": 0.4915, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0645503997802734, + "rewards/margins": 0.6983194351196289, + "rewards/rejected": -3.7628698348999023, + "step": 728 + }, + { + "epoch": 0.4761204996326231, + "grad_norm": 50.44249826662269, + "learning_rate": 9.420474940740042e-08, + "logits/chosen": -1.635010838508606, + "logits/rejected": -1.6417289972305298, + "logps/chosen": -834.1351318359375, + "logps/rejected": -948.0263061523438, + "loss": 0.5263, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.062831401824951, + "rewards/margins": 0.9943631887435913, + "rewards/rejected": -4.057194709777832, + "step": 729 + }, + { + "epoch": 0.4767736141725855, + "grad_norm": 14.64756449278543, + "learning_rate": 9.403929368782558e-08, + "logits/chosen": -1.5567909479141235, + "logits/rejected": -1.6151758432388306, + "logps/chosen": -768.6214599609375, + "logps/rejected": -870.6768798828125, + "loss": 0.4699, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4706671237945557, + "rewards/margins": 0.9328286647796631, + "rewards/rejected": -3.4034957885742188, + "step": 730 + }, + { + "epoch": 0.477426728712548, + "grad_norm": 13.947518321539896, + "learning_rate": 9.387373886623012e-08, + "logits/chosen": -1.51140558719635, + "logits/rejected": -1.536016583442688, + "logps/chosen": -860.7777709960938, + "logps/rejected": -902.8041381835938, + "loss": 0.52, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0753116607666016, + "rewards/margins": 0.5363348722457886, + "rewards/rejected": -3.6116464138031006, + "step": 731 + }, + { + "epoch": 0.4780798432525104, + "grad_norm": 16.851055770042308, + "learning_rate": 9.37080858043486e-08, + "logits/chosen": -1.4963881969451904, + "logits/rejected": -1.4485998153686523, + "logps/chosen": -813.0571899414062, + "logps/rejected": -845.43408203125, + "loss": 0.519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9917490482330322, + "rewards/margins": 0.81999272108078, + "rewards/rejected": -3.811741352081299, + "step": 732 + }, + { + "epoch": 0.4787329577924729, + "grad_norm": 36.178464401201914, + "learning_rate": 9.354233536442691e-08, + "logits/chosen": -1.5164326429367065, + "logits/rejected": -1.5172199010849, + "logps/chosen": -872.368408203125, + "logps/rejected": -948.313720703125, + "loss": 0.5787, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.244874954223633, + "rewards/margins": 0.6580997705459595, + "rewards/rejected": -3.9029746055603027, + "step": 733 + }, + { + "epoch": 0.4793860723324353, + "grad_norm": 12.484932403976112, + "learning_rate": 9.337648840921784e-08, + "logits/chosen": -1.6157573461532593, + "logits/rejected": -1.5741209983825684, + "logps/chosen": -884.84912109375, + "logps/rejected": -1095.204345703125, + "loss": 0.4598, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.276892900466919, + "rewards/margins": 1.09320068359375, + "rewards/rejected": -4.370093822479248, + "step": 734 + }, + { + "epoch": 0.48003918687239777, + "grad_norm": 14.089376755304505, + "learning_rate": 9.321054580197656e-08, + "logits/chosen": -1.4722511768341064, + "logits/rejected": -1.474959135055542, + "logps/chosen": -730.8760375976562, + "logps/rejected": -825.123779296875, + "loss": 0.5249, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8380813598632812, + "rewards/margins": 0.6124694347381592, + "rewards/rejected": -3.4505507946014404, + "step": 735 + }, + { + "epoch": 0.4806923014123602, + "grad_norm": 16.35406356020656, + "learning_rate": 9.304450840645609e-08, + "logits/chosen": -1.4326503276824951, + "logits/rejected": -1.4325727224349976, + "logps/chosen": -839.5531005859375, + "logps/rejected": -894.9701538085938, + "loss": 0.5027, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.4309120178222656, + "rewards/margins": 0.5214735865592957, + "rewards/rejected": -3.952385663986206, + "step": 736 + }, + { + "epoch": 0.48134541595232266, + "grad_norm": 17.20276393860269, + "learning_rate": 9.287837708690284e-08, + "logits/chosen": -1.5666112899780273, + "logits/rejected": -1.524141550064087, + "logps/chosen": -820.8998413085938, + "logps/rejected": -992.0237426757812, + "loss": 0.4732, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9690146446228027, + "rewards/margins": 0.996252179145813, + "rewards/rejected": -3.9652669429779053, + "step": 737 + }, + { + "epoch": 0.4819985304922851, + "grad_norm": 43.29010354161253, + "learning_rate": 9.271215270805212e-08, + "logits/chosen": -1.596786379814148, + "logits/rejected": -1.5615063905715942, + "logps/chosen": -804.3712768554688, + "logps/rejected": -849.1760864257812, + "loss": 0.56, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8177123069763184, + "rewards/margins": 0.5579187273979187, + "rewards/rejected": -3.3756308555603027, + "step": 738 + }, + { + "epoch": 0.48265164503224756, + "grad_norm": 15.346572016217646, + "learning_rate": 9.254583613512365e-08, + "logits/chosen": -1.6470632553100586, + "logits/rejected": -1.6171380281448364, + "logps/chosen": -913.9931640625, + "logps/rejected": -999.4883422851562, + "loss": 0.4807, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2777669429779053, + "rewards/margins": 0.7959187626838684, + "rewards/rejected": -4.073686122894287, + "step": 739 + }, + { + "epoch": 0.48330475957221, + "grad_norm": 9.556534348064194, + "learning_rate": 9.237942823381696e-08, + "logits/chosen": -1.573829174041748, + "logits/rejected": -1.5064740180969238, + "logps/chosen": -777.1859741210938, + "logps/rejected": -792.814208984375, + "loss": 0.469, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.7544057369232178, + "rewards/margins": 0.8636622428894043, + "rewards/rejected": -3.618068218231201, + "step": 740 + }, + { + "epoch": 0.48395787411217245, + "grad_norm": 25.011865185818184, + "learning_rate": 9.221292987030702e-08, + "logits/chosen": -1.5379745960235596, + "logits/rejected": -1.4926586151123047, + "logps/chosen": -820.5584716796875, + "logps/rejected": -835.9033203125, + "loss": 0.4647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9160995483398438, + "rewards/margins": 0.7988985180854797, + "rewards/rejected": -3.7149980068206787, + "step": 741 + }, + { + "epoch": 0.48461098865213487, + "grad_norm": 13.575787231912816, + "learning_rate": 9.204634191123965e-08, + "logits/chosen": -1.508238673210144, + "logits/rejected": -1.4783196449279785, + "logps/chosen": -902.4708251953125, + "logps/rejected": -893.9943237304688, + "loss": 0.5139, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.170164108276367, + "rewards/margins": 0.5966829061508179, + "rewards/rejected": -3.7668471336364746, + "step": 742 + }, + { + "epoch": 0.48526410319209734, + "grad_norm": 15.830542176100348, + "learning_rate": 9.187966522372705e-08, + "logits/chosen": -1.534610390663147, + "logits/rejected": -1.5084702968597412, + "logps/chosen": -787.0113525390625, + "logps/rejected": -803.7102661132812, + "loss": 0.5124, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9289166927337646, + "rewards/margins": 0.520931601524353, + "rewards/rejected": -3.4498486518859863, + "step": 743 + }, + { + "epoch": 0.48591721773205976, + "grad_norm": 21.140461323350923, + "learning_rate": 9.17129006753432e-08, + "logits/chosen": -1.525702714920044, + "logits/rejected": -1.5293265581130981, + "logps/chosen": -725.1962280273438, + "logps/rejected": -809.5682983398438, + "loss": 0.4683, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.3667490482330322, + "rewards/margins": 0.9049954414367676, + "rewards/rejected": -3.271744728088379, + "step": 744 + }, + { + "epoch": 0.48657033227202223, + "grad_norm": 38.71867984311598, + "learning_rate": 9.154604913411943e-08, + "logits/chosen": -1.5762661695480347, + "logits/rejected": -1.5938408374786377, + "logps/chosen": -876.135986328125, + "logps/rejected": -978.6011962890625, + "loss": 0.4793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.281604528427124, + "rewards/margins": 0.8068190813064575, + "rewards/rejected": -4.088423252105713, + "step": 745 + }, + { + "epoch": 0.48722344681198465, + "grad_norm": 29.90445121131915, + "learning_rate": 9.137911146853995e-08, + "logits/chosen": -1.5684157609939575, + "logits/rejected": -1.5826231241226196, + "logps/chosen": -795.3310546875, + "logps/rejected": -863.130859375, + "loss": 0.572, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2306532859802246, + "rewards/margins": 0.5794581770896912, + "rewards/rejected": -3.8101112842559814, + "step": 746 + }, + { + "epoch": 0.4878765613519471, + "grad_norm": 20.953402023971254, + "learning_rate": 9.121208854753716e-08, + "logits/chosen": -1.511757493019104, + "logits/rejected": -1.4341886043548584, + "logps/chosen": -716.0580444335938, + "logps/rejected": -796.005859375, + "loss": 0.5049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9407427310943604, + "rewards/margins": 0.7499470114707947, + "rewards/rejected": -3.6906895637512207, + "step": 747 + }, + { + "epoch": 0.48852967589190954, + "grad_norm": 23.63758752296789, + "learning_rate": 9.10449812404873e-08, + "logits/chosen": -1.5069888830184937, + "logits/rejected": -1.537886142730713, + "logps/chosen": -781.0473022460938, + "logps/rejected": -866.6488037109375, + "loss": 0.4737, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.995260000228882, + "rewards/margins": 0.538629412651062, + "rewards/rejected": -3.5338892936706543, + "step": 748 + }, + { + "epoch": 0.489182790431872, + "grad_norm": 9.601439395523968, + "learning_rate": 9.087779041720581e-08, + "logits/chosen": -1.5230836868286133, + "logits/rejected": -1.4928765296936035, + "logps/chosen": -708.7151489257812, + "logps/rejected": -755.310302734375, + "loss": 0.4822, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.70585560798645, + "rewards/margins": 0.5378022789955139, + "rewards/rejected": -3.2436575889587402, + "step": 749 + }, + { + "epoch": 0.48983590497183443, + "grad_norm": 52.26590360034194, + "learning_rate": 9.071051694794283e-08, + "logits/chosen": -1.546759843826294, + "logits/rejected": -1.4855560064315796, + "logps/chosen": -800.2686767578125, + "logps/rejected": -858.6179809570312, + "loss": 0.5451, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1868784427642822, + "rewards/margins": 0.29365602135658264, + "rewards/rejected": -3.480534553527832, + "step": 750 + }, + { + "epoch": 0.4904890195117969, + "grad_norm": 27.46647480470444, + "learning_rate": 9.054316170337872e-08, + "logits/chosen": -1.5644291639328003, + "logits/rejected": -1.5516630411148071, + "logps/chosen": -817.9088745117188, + "logps/rejected": -884.1804809570312, + "loss": 0.4977, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8426513671875, + "rewards/margins": 0.5670284032821655, + "rewards/rejected": -3.409679412841797, + "step": 751 + }, + { + "epoch": 0.4911421340517593, + "grad_norm": 22.65417735488772, + "learning_rate": 9.037572555461949e-08, + "logits/chosen": -1.4849984645843506, + "logits/rejected": -1.5145663022994995, + "logps/chosen": -797.359619140625, + "logps/rejected": -906.737548828125, + "loss": 0.4895, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.670722007751465, + "rewards/margins": 0.6933446526527405, + "rewards/rejected": -3.3640666007995605, + "step": 752 + }, + { + "epoch": 0.4917952485917218, + "grad_norm": 13.411177355069162, + "learning_rate": 9.020820937319222e-08, + "logits/chosen": -1.5731292963027954, + "logits/rejected": -1.5484392642974854, + "logps/chosen": -884.296630859375, + "logps/rejected": -940.8453979492188, + "loss": 0.471, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.245163679122925, + "rewards/margins": 0.7693843245506287, + "rewards/rejected": -4.014548301696777, + "step": 753 + }, + { + "epoch": 0.4924483631316842, + "grad_norm": 22.100162564155266, + "learning_rate": 9.004061403104063e-08, + "logits/chosen": -1.5291342735290527, + "logits/rejected": -1.541631817817688, + "logps/chosen": -894.0836181640625, + "logps/rejected": -972.8955688476562, + "loss": 0.4913, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.270658016204834, + "rewards/margins": 0.9292981028556824, + "rewards/rejected": -4.19995641708374, + "step": 754 + }, + { + "epoch": 0.49310147767164664, + "grad_norm": 40.53277025479099, + "learning_rate": 8.987294040052048e-08, + "logits/chosen": -1.513358235359192, + "logits/rejected": -1.4985860586166382, + "logps/chosen": -815.3209838867188, + "logps/rejected": -845.5489501953125, + "loss": 0.5791, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.830094337463379, + "rewards/margins": 0.6605682373046875, + "rewards/rejected": -3.4906625747680664, + "step": 755 + }, + { + "epoch": 0.4937545922116091, + "grad_norm": 52.36436157875439, + "learning_rate": 8.970518935439494e-08, + "logits/chosen": -1.5312970876693726, + "logits/rejected": -1.536803126335144, + "logps/chosen": -841.9302978515625, + "logps/rejected": -1007.2651977539062, + "loss": 0.52, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.154250383377075, + "rewards/margins": 0.8153612017631531, + "rewards/rejected": -3.9696109294891357, + "step": 756 + }, + { + "epoch": 0.49440770675157153, + "grad_norm": 10.906915240973083, + "learning_rate": 8.953736176583024e-08, + "logits/chosen": -1.5253336429595947, + "logits/rejected": -1.533002495765686, + "logps/chosen": -844.7198486328125, + "logps/rejected": -904.4848022460938, + "loss": 0.5255, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.983484983444214, + "rewards/margins": 0.8254218101501465, + "rewards/rejected": -3.8089070320129395, + "step": 757 + }, + { + "epoch": 0.495060821291534, + "grad_norm": 21.62178976119015, + "learning_rate": 8.936945850839103e-08, + "logits/chosen": -1.5330315828323364, + "logits/rejected": -1.5863041877746582, + "logps/chosen": -836.3416137695312, + "logps/rejected": -919.7655029296875, + "loss": 0.5125, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1585867404937744, + "rewards/margins": 0.9332132935523987, + "rewards/rejected": -4.091800212860107, + "step": 758 + }, + { + "epoch": 0.4957139358314964, + "grad_norm": 59.346179693747, + "learning_rate": 8.920148045603571e-08, + "logits/chosen": -1.4187573194503784, + "logits/rejected": -1.437163233757019, + "logps/chosen": -740.33837890625, + "logps/rejected": -822.4481811523438, + "loss": 0.4519, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.87211275100708, + "rewards/margins": 0.8624040484428406, + "rewards/rejected": -3.7345166206359863, + "step": 759 + }, + { + "epoch": 0.4963670503714589, + "grad_norm": 17.104210267847428, + "learning_rate": 8.903342848311213e-08, + "logits/chosen": -1.4946515560150146, + "logits/rejected": -1.511128306388855, + "logps/chosen": -787.0501098632812, + "logps/rejected": -832.3441162109375, + "loss": 0.5147, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0261549949645996, + "rewards/margins": 0.5119923949241638, + "rewards/rejected": -3.538147211074829, + "step": 760 + }, + { + "epoch": 0.4970201649114213, + "grad_norm": 37.01444427857606, + "learning_rate": 8.886530346435281e-08, + "logits/chosen": -1.5614135265350342, + "logits/rejected": -1.5467019081115723, + "logps/chosen": -809.6796264648438, + "logps/rejected": -850.0182495117188, + "loss": 0.5183, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.892712116241455, + "rewards/margins": 0.5590572357177734, + "rewards/rejected": -3.4517693519592285, + "step": 761 + }, + { + "epoch": 0.4976732794513838, + "grad_norm": 20.36864614388382, + "learning_rate": 8.869710627487057e-08, + "logits/chosen": -1.5446330308914185, + "logits/rejected": -1.5651966333389282, + "logps/chosen": -765.47412109375, + "logps/rejected": -840.524169921875, + "loss": 0.5331, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1215081214904785, + "rewards/margins": 0.7881425619125366, + "rewards/rejected": -3.9096505641937256, + "step": 762 + }, + { + "epoch": 0.4983263939913462, + "grad_norm": 88.03942880437029, + "learning_rate": 8.852883779015377e-08, + "logits/chosen": -1.5617398023605347, + "logits/rejected": -1.5589573383331299, + "logps/chosen": -719.5538940429688, + "logps/rejected": -807.90869140625, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8834686279296875, + "rewards/margins": 0.5054572820663452, + "rewards/rejected": -3.3889260292053223, + "step": 763 + }, + { + "epoch": 0.4989795085313087, + "grad_norm": 85.24504779473465, + "learning_rate": 8.836049888606199e-08, + "logits/chosen": -1.5608245134353638, + "logits/rejected": -1.5477367639541626, + "logps/chosen": -867.1178588867188, + "logps/rejected": -1046.8099365234375, + "loss": 0.4946, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.545870304107666, + "rewards/margins": 1.3517121076583862, + "rewards/rejected": -4.897582530975342, + "step": 764 + }, + { + "epoch": 0.4996326230712711, + "grad_norm": 103.8676814765994, + "learning_rate": 8.819209043882131e-08, + "logits/chosen": -1.5328567028045654, + "logits/rejected": -1.4881083965301514, + "logps/chosen": -879.6492919921875, + "logps/rejected": -995.6163940429688, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1603341102600098, + "rewards/margins": 1.162267804145813, + "rewards/rejected": -4.322602272033691, + "step": 765 + }, + { + "epoch": 0.5002857376112335, + "grad_norm": 11.728242486539118, + "learning_rate": 8.802361332501978e-08, + "logits/chosen": -1.5301557779312134, + "logits/rejected": -1.5447125434875488, + "logps/chosen": -793.1903076171875, + "logps/rejected": -930.641357421875, + "loss": 0.4803, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.126751184463501, + "rewards/margins": 0.9962693452835083, + "rewards/rejected": -4.123020648956299, + "step": 766 + }, + { + "epoch": 0.500938852151196, + "grad_norm": 10.020322944474998, + "learning_rate": 8.785506842160285e-08, + "logits/chosen": -1.5191065073013306, + "logits/rejected": -1.5158594846725464, + "logps/chosen": -775.5346069335938, + "logps/rejected": -828.55126953125, + "loss": 0.4989, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9595863819122314, + "rewards/margins": 0.6543839573860168, + "rewards/rejected": -3.6139702796936035, + "step": 767 + }, + { + "epoch": 0.5015919666911585, + "grad_norm": 19.19484359451946, + "learning_rate": 8.768645660586886e-08, + "logits/chosen": -1.5571017265319824, + "logits/rejected": -1.599165439605713, + "logps/chosen": -870.1515502929688, + "logps/rejected": -1006.7691650390625, + "loss": 0.4255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.215618848800659, + "rewards/margins": 1.278153657913208, + "rewards/rejected": -4.493772506713867, + "step": 768 + }, + { + "epoch": 0.5022450812311209, + "grad_norm": 55.25909288858508, + "learning_rate": 8.751777875546442e-08, + "logits/chosen": -1.5235897302627563, + "logits/rejected": -1.5015480518341064, + "logps/chosen": -766.0463256835938, + "logps/rejected": -827.17919921875, + "loss": 0.5069, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9575929641723633, + "rewards/margins": 0.6264766454696655, + "rewards/rejected": -3.5840699672698975, + "step": 769 + }, + { + "epoch": 0.5028981957710833, + "grad_norm": 31.7480045035506, + "learning_rate": 8.734903574837985e-08, + "logits/chosen": -1.4985229969024658, + "logits/rejected": -1.4879060983657837, + "logps/chosen": -830.8861083984375, + "logps/rejected": -901.6783447265625, + "loss": 0.5273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9937520027160645, + "rewards/margins": 0.953312337398529, + "rewards/rejected": -3.9470643997192383, + "step": 770 + }, + { + "epoch": 0.5035513103110458, + "grad_norm": 23.133647048008825, + "learning_rate": 8.718022846294466e-08, + "logits/chosen": -1.4812209606170654, + "logits/rejected": -1.4926509857177734, + "logps/chosen": -764.713134765625, + "logps/rejected": -867.8782348632812, + "loss": 0.4615, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.96504807472229, + "rewards/margins": 0.7924860715866089, + "rewards/rejected": -3.7575340270996094, + "step": 771 + }, + { + "epoch": 0.5042044248510082, + "grad_norm": 13.218818964781667, + "learning_rate": 8.701135777782291e-08, + "logits/chosen": -1.5664947032928467, + "logits/rejected": -1.5273160934448242, + "logps/chosen": -843.0235595703125, + "logps/rejected": -841.0691528320312, + "loss": 0.6184, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2560126781463623, + "rewards/margins": 0.49974071979522705, + "rewards/rejected": -3.755753517150879, + "step": 772 + }, + { + "epoch": 0.5048575393909707, + "grad_norm": 29.96826439077636, + "learning_rate": 8.684242457200865e-08, + "logits/chosen": -1.4153635501861572, + "logits/rejected": -1.4303746223449707, + "logps/chosen": -740.8873291015625, + "logps/rejected": -893.1712646484375, + "loss": 0.4935, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7135632038116455, + "rewards/margins": 1.0459972620010376, + "rewards/rejected": -3.7595603466033936, + "step": 773 + }, + { + "epoch": 0.5055106539309331, + "grad_norm": 17.20275448114271, + "learning_rate": 8.667342972482136e-08, + "logits/chosen": -1.5214056968688965, + "logits/rejected": -1.5121796131134033, + "logps/chosen": -887.89794921875, + "logps/rejected": -939.7318115234375, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.212088108062744, + "rewards/margins": 0.7043830156326294, + "rewards/rejected": -3.916471481323242, + "step": 774 + }, + { + "epoch": 0.5061637684708956, + "grad_norm": 44.42583289533232, + "learning_rate": 8.650437411590141e-08, + "logits/chosen": -1.5345828533172607, + "logits/rejected": -1.5515172481536865, + "logps/chosen": -841.9071044921875, + "logps/rejected": -872.3067626953125, + "loss": 0.5004, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2682528495788574, + "rewards/margins": 0.5741406679153442, + "rewards/rejected": -3.842393636703491, + "step": 775 + }, + { + "epoch": 0.506816883010858, + "grad_norm": 11.651543725419435, + "learning_rate": 8.633525862520538e-08, + "logits/chosen": -1.5557384490966797, + "logits/rejected": -1.5258511304855347, + "logps/chosen": -821.5269165039062, + "logps/rejected": -849.76611328125, + "loss": 0.5619, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8014473915100098, + "rewards/margins": 0.6341366171836853, + "rewards/rejected": -3.435584306716919, + "step": 776 + }, + { + "epoch": 0.5074699975508205, + "grad_norm": 46.00900328788502, + "learning_rate": 8.616608413300162e-08, + "logits/chosen": -1.465898871421814, + "logits/rejected": -1.5084877014160156, + "logps/chosen": -825.7061767578125, + "logps/rejected": -899.7260131835938, + "loss": 0.5337, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.140413284301758, + "rewards/margins": 0.8182295560836792, + "rewards/rejected": -3.9586427211761475, + "step": 777 + }, + { + "epoch": 0.5081231120907829, + "grad_norm": 34.00237370143665, + "learning_rate": 8.599685151986555e-08, + "logits/chosen": -1.5190675258636475, + "logits/rejected": -1.5114282369613647, + "logps/chosen": -788.9437255859375, + "logps/rejected": -859.0122680664062, + "loss": 0.5124, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8215911388397217, + "rewards/margins": 0.813235878944397, + "rewards/rejected": -3.634827136993408, + "step": 778 + }, + { + "epoch": 0.5087762266307454, + "grad_norm": 11.243459420945996, + "learning_rate": 8.582756166667506e-08, + "logits/chosen": -1.523029088973999, + "logits/rejected": -1.5339922904968262, + "logps/chosen": -776.9263916015625, + "logps/rejected": -869.6576538085938, + "loss": 0.4985, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9392380714416504, + "rewards/margins": 0.5630586743354797, + "rewards/rejected": -3.5022971630096436, + "step": 779 + }, + { + "epoch": 0.5094293411707078, + "grad_norm": 11.246859304420749, + "learning_rate": 8.565821545460607e-08, + "logits/chosen": -1.5100888013839722, + "logits/rejected": -1.5363471508026123, + "logps/chosen": -822.7084350585938, + "logps/rejected": -870.5260009765625, + "loss": 0.4704, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.991995334625244, + "rewards/margins": 0.8592602610588074, + "rewards/rejected": -3.851255416870117, + "step": 780 + }, + { + "epoch": 0.5100824557106702, + "grad_norm": 22.710318086337125, + "learning_rate": 8.548881376512784e-08, + "logits/chosen": -1.55587899684906, + "logits/rejected": -1.5447403192520142, + "logps/chosen": -911.83935546875, + "logps/rejected": -1001.2164916992188, + "loss": 0.5482, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.194542407989502, + "rewards/margins": 1.0161552429199219, + "rewards/rejected": -4.210697174072266, + "step": 781 + }, + { + "epoch": 0.5107355702506327, + "grad_norm": 30.493025252100427, + "learning_rate": 8.531935747999837e-08, + "logits/chosen": -1.5483152866363525, + "logits/rejected": -1.5260441303253174, + "logps/chosen": -825.1983642578125, + "logps/rejected": -968.305419921875, + "loss": 0.4986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9049713611602783, + "rewards/margins": 0.961846113204956, + "rewards/rejected": -3.8668179512023926, + "step": 782 + }, + { + "epoch": 0.5113886847905952, + "grad_norm": 11.07008824806609, + "learning_rate": 8.514984748125984e-08, + "logits/chosen": -1.4909794330596924, + "logits/rejected": -1.5248680114746094, + "logps/chosen": -831.9634399414062, + "logps/rejected": -922.552978515625, + "loss": 0.4796, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.975536346435547, + "rewards/margins": 1.0760196447372437, + "rewards/rejected": -4.051555633544922, + "step": 783 + }, + { + "epoch": 0.5120417993305576, + "grad_norm": 31.858032148945103, + "learning_rate": 8.498028465123402e-08, + "logits/chosen": -1.4506326913833618, + "logits/rejected": -1.4219865798950195, + "logps/chosen": -696.5535278320312, + "logps/rejected": -853.2267456054688, + "loss": 0.4794, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.721505641937256, + "rewards/margins": 1.1351335048675537, + "rewards/rejected": -3.8566393852233887, + "step": 784 + }, + { + "epoch": 0.51269491387052, + "grad_norm": 67.88266427137975, + "learning_rate": 8.48106698725177e-08, + "logits/chosen": -1.5079140663146973, + "logits/rejected": -1.5019490718841553, + "logps/chosen": -861.70458984375, + "logps/rejected": -1028.7357177734375, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1499381065368652, + "rewards/margins": 1.0946592092514038, + "rewards/rejected": -4.244597434997559, + "step": 785 + }, + { + "epoch": 0.5133480284104824, + "grad_norm": 77.6759127035247, + "learning_rate": 8.464100402797803e-08, + "logits/chosen": -1.4545526504516602, + "logits/rejected": -1.4197547435760498, + "logps/chosen": -805.7166137695312, + "logps/rejected": -865.64501953125, + "loss": 0.5099, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2749907970428467, + "rewards/margins": 0.8093310594558716, + "rewards/rejected": -4.084321975708008, + "step": 786 + }, + { + "epoch": 0.514001142950445, + "grad_norm": 16.83703843955555, + "learning_rate": 8.4471288000748e-08, + "logits/chosen": -1.5386128425598145, + "logits/rejected": -1.5408875942230225, + "logps/chosen": -842.3564453125, + "logps/rejected": -944.2308959960938, + "loss": 0.5631, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.174443483352661, + "rewards/margins": 0.8851109147071838, + "rewards/rejected": -4.059554576873779, + "step": 787 + }, + { + "epoch": 0.5146542574904074, + "grad_norm": 31.817856468339304, + "learning_rate": 8.430152267422177e-08, + "logits/chosen": -1.5486061573028564, + "logits/rejected": -1.53328537940979, + "logps/chosen": -797.2845458984375, + "logps/rejected": -912.8262329101562, + "loss": 0.5719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.162616729736328, + "rewards/margins": 0.9437724947929382, + "rewards/rejected": -4.106389045715332, + "step": 788 + }, + { + "epoch": 0.5153073720303698, + "grad_norm": 32.1546133597687, + "learning_rate": 8.413170893205015e-08, + "logits/chosen": -1.5652754306793213, + "logits/rejected": -1.5659431219100952, + "logps/chosen": -796.43994140625, + "logps/rejected": -850.5557861328125, + "loss": 0.5232, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.044846773147583, + "rewards/margins": 0.7479317784309387, + "rewards/rejected": -3.792778730392456, + "step": 789 + }, + { + "epoch": 0.5159604865703322, + "grad_norm": 11.020829449688339, + "learning_rate": 8.396184765813591e-08, + "logits/chosen": -1.5715382099151611, + "logits/rejected": -1.5400923490524292, + "logps/chosen": -811.490234375, + "logps/rejected": -932.505615234375, + "loss": 0.5092, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.054403781890869, + "rewards/margins": 1.0824800729751587, + "rewards/rejected": -4.136883735656738, + "step": 790 + }, + { + "epoch": 0.5166136011102948, + "grad_norm": 30.888894963547546, + "learning_rate": 8.379193973662927e-08, + "logits/chosen": -1.5172935724258423, + "logits/rejected": -1.5318307876586914, + "logps/chosen": -823.394287109375, + "logps/rejected": -943.768310546875, + "loss": 0.498, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9745614528656006, + "rewards/margins": 1.0555309057235718, + "rewards/rejected": -4.030092716217041, + "step": 791 + }, + { + "epoch": 0.5172667156502572, + "grad_norm": 29.8842541290539, + "learning_rate": 8.362198605192326e-08, + "logits/chosen": -1.557112693786621, + "logits/rejected": -1.5465399026870728, + "logps/chosen": -826.0409545898438, + "logps/rejected": -961.9547119140625, + "loss": 0.4962, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1106011867523193, + "rewards/margins": 0.760248601436615, + "rewards/rejected": -3.870850086212158, + "step": 792 + }, + { + "epoch": 0.5179198301902196, + "grad_norm": 25.848000115396033, + "learning_rate": 8.345198748864909e-08, + "logits/chosen": -1.4976153373718262, + "logits/rejected": -1.4590017795562744, + "logps/chosen": -834.5736083984375, + "logps/rejected": -815.7548217773438, + "loss": 0.5151, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.545506477355957, + "rewards/margins": 0.19579878449440002, + "rewards/rejected": -3.741305351257324, + "step": 793 + }, + { + "epoch": 0.518572944730182, + "grad_norm": 22.608186681230375, + "learning_rate": 8.328194493167156e-08, + "logits/chosen": -1.5127606391906738, + "logits/rejected": -1.5453428030014038, + "logps/chosen": -874.2019653320312, + "logps/rejected": -923.8192749023438, + "loss": 0.5177, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3078603744506836, + "rewards/margins": 0.7927409410476685, + "rewards/rejected": -4.1006011962890625, + "step": 794 + }, + { + "epoch": 0.5192260592701445, + "grad_norm": 14.489367562787846, + "learning_rate": 8.311185926608451e-08, + "logits/chosen": -1.50680673122406, + "logits/rejected": -1.484675645828247, + "logps/chosen": -844.3919067382812, + "logps/rejected": -819.8763427734375, + "loss": 0.5177, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.12322998046875, + "rewards/margins": 0.5924826860427856, + "rewards/rejected": -3.715712785720825, + "step": 795 + }, + { + "epoch": 0.519879173810107, + "grad_norm": 11.919265518405389, + "learning_rate": 8.29417313772061e-08, + "logits/chosen": -1.5627418756484985, + "logits/rejected": -1.580378770828247, + "logps/chosen": -855.9948120117188, + "logps/rejected": -1031.594970703125, + "loss": 0.5956, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3382222652435303, + "rewards/margins": 0.9997389912605286, + "rewards/rejected": -4.337961673736572, + "step": 796 + }, + { + "epoch": 0.5205322883500694, + "grad_norm": 28.37261621955881, + "learning_rate": 8.277156215057434e-08, + "logits/chosen": -1.4434901475906372, + "logits/rejected": -1.4327919483184814, + "logps/chosen": -895.87646484375, + "logps/rejected": -958.2759399414062, + "loss": 0.4678, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.354534864425659, + "rewards/margins": 0.7718441486358643, + "rewards/rejected": -4.126379013061523, + "step": 797 + }, + { + "epoch": 0.5211854028900318, + "grad_norm": 22.35879028875356, + "learning_rate": 8.260135247194235e-08, + "logits/chosen": -1.539190411567688, + "logits/rejected": -1.5733530521392822, + "logps/chosen": -710.5164794921875, + "logps/rejected": -795.2035522460938, + "loss": 0.4548, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8853349685668945, + "rewards/margins": 0.540171205997467, + "rewards/rejected": -3.425506353378296, + "step": 798 + }, + { + "epoch": 0.5218385174299943, + "grad_norm": 10.507592796277946, + "learning_rate": 8.243110322727382e-08, + "logits/chosen": -1.5344185829162598, + "logits/rejected": -1.5373344421386719, + "logps/chosen": -755.857177734375, + "logps/rejected": -858.6672973632812, + "loss": 0.4848, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8476948738098145, + "rewards/margins": 0.9375979900360107, + "rewards/rejected": -3.785292863845825, + "step": 799 + }, + { + "epoch": 0.5224916319699567, + "grad_norm": 33.61581825469172, + "learning_rate": 8.226081530273843e-08, + "logits/chosen": -1.557815670967102, + "logits/rejected": -1.561926007270813, + "logps/chosen": -820.7285766601562, + "logps/rejected": -851.3828125, + "loss": 0.511, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0852344036102295, + "rewards/margins": 0.7549958825111389, + "rewards/rejected": -3.8402299880981445, + "step": 800 + }, + { + "epoch": 0.5224916319699567, + "eval_logits/chosen": -1.527337670326233, + "eval_logits/rejected": -1.5160092115402222, + "eval_logps/chosen": -815.5145263671875, + "eval_logps/rejected": -886.17138671875, + "eval_loss": 0.5177087187767029, + "eval_rewards/accuracies": 0.7400000095367432, + "eval_rewards/chosen": -3.0516912937164307, + "eval_rewards/margins": 0.7876277565956116, + "eval_rewards/rejected": -3.8393189907073975, + "eval_runtime": 296.7047, + "eval_samples_per_second": 13.481, + "eval_steps_per_second": 0.843, + "step": 800 + }, + { + "epoch": 0.5231447465099192, + "grad_norm": 29.72721689887477, + "learning_rate": 8.209048958470714e-08, + "logits/chosen": -1.5397412776947021, + "logits/rejected": -1.573317050933838, + "logps/chosen": -794.4456176757812, + "logps/rejected": -861.4932861328125, + "loss": 0.4681, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1979122161865234, + "rewards/margins": 0.44129082560539246, + "rewards/rejected": -3.6392035484313965, + "step": 801 + }, + { + "epoch": 0.5237978610498816, + "grad_norm": 24.93121116287881, + "learning_rate": 8.192012695974765e-08, + "logits/chosen": -1.440004587173462, + "logits/rejected": -1.4455904960632324, + "logps/chosen": -756.099365234375, + "logps/rejected": -812.0906372070312, + "loss": 0.4259, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7917063236236572, + "rewards/margins": 0.8878428936004639, + "rewards/rejected": -3.6795494556427, + "step": 802 + }, + { + "epoch": 0.5244509755898441, + "grad_norm": 21.486488987183108, + "learning_rate": 8.174972831461975e-08, + "logits/chosen": -1.4720410108566284, + "logits/rejected": -1.4299757480621338, + "logps/chosen": -797.6776733398438, + "logps/rejected": -876.0466918945312, + "loss": 0.5424, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.303070545196533, + "rewards/margins": 0.6923160552978516, + "rewards/rejected": -3.9953863620758057, + "step": 803 + }, + { + "epoch": 0.5251040901298065, + "grad_norm": 15.956985029082132, + "learning_rate": 8.157929453627079e-08, + "logits/chosen": -1.528617262840271, + "logits/rejected": -1.543229579925537, + "logps/chosen": -814.8829345703125, + "logps/rejected": -840.0612182617188, + "loss": 0.5554, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0760016441345215, + "rewards/margins": 0.729838490486145, + "rewards/rejected": -3.805840253829956, + "step": 804 + }, + { + "epoch": 0.5257572046697689, + "grad_norm": 14.955725644267702, + "learning_rate": 8.140882651183087e-08, + "logits/chosen": -1.4945454597473145, + "logits/rejected": -1.50547194480896, + "logps/chosen": -847.5591430664062, + "logps/rejected": -860.2355346679688, + "loss": 0.5914, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3800101280212402, + "rewards/margins": 0.4141124486923218, + "rewards/rejected": -3.7941229343414307, + "step": 805 + }, + { + "epoch": 0.5264103192097314, + "grad_norm": 22.99418263109128, + "learning_rate": 8.123832512860848e-08, + "logits/chosen": -1.5119673013687134, + "logits/rejected": -1.5116000175476074, + "logps/chosen": -837.5704956054688, + "logps/rejected": -927.5317993164062, + "loss": 0.5061, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0885889530181885, + "rewards/margins": 0.9447399377822876, + "rewards/rejected": -4.033329010009766, + "step": 806 + }, + { + "epoch": 0.5270634337496939, + "grad_norm": 18.701086163571972, + "learning_rate": 8.106779127408563e-08, + "logits/chosen": -1.4224164485931396, + "logits/rejected": -1.4035032987594604, + "logps/chosen": -733.4963989257812, + "logps/rejected": -846.0479736328125, + "loss": 0.5246, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.811711072921753, + "rewards/margins": 0.7736900448799133, + "rewards/rejected": -3.5854010581970215, + "step": 807 + }, + { + "epoch": 0.5277165482896563, + "grad_norm": 40.22777398303923, + "learning_rate": 8.08972258359134e-08, + "logits/chosen": -1.497331142425537, + "logits/rejected": -1.5132163763046265, + "logps/chosen": -791.0665893554688, + "logps/rejected": -848.4359130859375, + "loss": 0.5209, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.804387092590332, + "rewards/margins": 0.7790587544441223, + "rewards/rejected": -3.5834455490112305, + "step": 808 + }, + { + "epoch": 0.5283696628296187, + "grad_norm": 28.929510873590274, + "learning_rate": 8.07266297019073e-08, + "logits/chosen": -1.5387717485427856, + "logits/rejected": -1.5651496648788452, + "logps/chosen": -821.0679321289062, + "logps/rejected": -911.5535278320312, + "loss": 0.4894, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0287632942199707, + "rewards/margins": 0.7187597155570984, + "rewards/rejected": -3.747523307800293, + "step": 809 + }, + { + "epoch": 0.5290227773695811, + "grad_norm": 45.22369470855717, + "learning_rate": 8.055600376004255e-08, + "logits/chosen": -1.5283149480819702, + "logits/rejected": -1.520777940750122, + "logps/chosen": -786.1401977539062, + "logps/rejected": -925.03125, + "loss": 0.5102, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.684399366378784, + "rewards/margins": 1.1089178323745728, + "rewards/rejected": -3.7933173179626465, + "step": 810 + }, + { + "epoch": 0.5296758919095437, + "grad_norm": 42.19326253786495, + "learning_rate": 8.038534889844956e-08, + "logits/chosen": -1.6188210248947144, + "logits/rejected": -1.5729517936706543, + "logps/chosen": -868.0130004882812, + "logps/rejected": -917.4168090820312, + "loss": 0.5735, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.09293270111084, + "rewards/margins": 1.0333658456802368, + "rewards/rejected": -4.126298904418945, + "step": 811 + }, + { + "epoch": 0.5303290064495061, + "grad_norm": 117.37707476367844, + "learning_rate": 8.021466600540928e-08, + "logits/chosen": -1.5684051513671875, + "logits/rejected": -1.5472626686096191, + "logps/chosen": -852.613525390625, + "logps/rejected": -913.04833984375, + "loss": 0.5436, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4583096504211426, + "rewards/margins": 0.717620313167572, + "rewards/rejected": -4.175930023193359, + "step": 812 + }, + { + "epoch": 0.5309821209894685, + "grad_norm": 28.12903118620616, + "learning_rate": 8.004395596934856e-08, + "logits/chosen": -1.4949054718017578, + "logits/rejected": -1.4636915922164917, + "logps/chosen": -733.1012573242188, + "logps/rejected": -767.9586181640625, + "loss": 0.5379, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7585432529449463, + "rewards/margins": 0.46337318420410156, + "rewards/rejected": -3.221916675567627, + "step": 813 + }, + { + "epoch": 0.5316352355294309, + "grad_norm": 9.221461814471073, + "learning_rate": 7.987321967883549e-08, + "logits/chosen": -1.6841922998428345, + "logits/rejected": -1.6034355163574219, + "logps/chosen": -839.085205078125, + "logps/rejected": -876.251953125, + "loss": 0.5158, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.193796396255493, + "rewards/margins": 0.6729452610015869, + "rewards/rejected": -3.866741418838501, + "step": 814 + }, + { + "epoch": 0.5322883500693935, + "grad_norm": 10.618048036543714, + "learning_rate": 7.970245802257487e-08, + "logits/chosen": -1.6014914512634277, + "logits/rejected": -1.5899266004562378, + "logps/chosen": -847.2337646484375, + "logps/rejected": -908.861572265625, + "loss": 0.5192, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.173642158508301, + "rewards/margins": 0.662325382232666, + "rewards/rejected": -3.835967540740967, + "step": 815 + }, + { + "epoch": 0.5329414646093559, + "grad_norm": 24.838517311372605, + "learning_rate": 7.953167188940353e-08, + "logits/chosen": -1.531007170677185, + "logits/rejected": -1.5374797582626343, + "logps/chosen": -871.73291015625, + "logps/rejected": -1004.2645874023438, + "loss": 0.4973, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.170253276824951, + "rewards/margins": 0.9588162899017334, + "rewards/rejected": -4.1290693283081055, + "step": 816 + }, + { + "epoch": 0.5335945791493183, + "grad_norm": 101.58547120360855, + "learning_rate": 7.936086216828568e-08, + "logits/chosen": -1.5855270624160767, + "logits/rejected": -1.600976586341858, + "logps/chosen": -784.5303955078125, + "logps/rejected": -773.5015258789062, + "loss": 0.4903, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.953719139099121, + "rewards/margins": 0.48683837056159973, + "rewards/rejected": -3.4405574798583984, + "step": 817 + }, + { + "epoch": 0.5342476936892807, + "grad_norm": 39.27500759821802, + "learning_rate": 7.919002974830833e-08, + "logits/chosen": -1.577518105506897, + "logits/rejected": -1.5548105239868164, + "logps/chosen": -707.9842529296875, + "logps/rejected": -847.3580932617188, + "loss": 0.4995, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.563530921936035, + "rewards/margins": 0.9225165843963623, + "rewards/rejected": -3.4860472679138184, + "step": 818 + }, + { + "epoch": 0.5349008082292432, + "grad_norm": 51.172477176857036, + "learning_rate": 7.901917551867663e-08, + "logits/chosen": -1.5167884826660156, + "logits/rejected": -1.4933404922485352, + "logps/chosen": -764.8781127929688, + "logps/rejected": -829.58935546875, + "loss": 0.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.603954792022705, + "rewards/margins": 0.4926825761795044, + "rewards/rejected": -3.09663724899292, + "step": 819 + }, + { + "epoch": 0.5355539227692057, + "grad_norm": 30.46290960655871, + "learning_rate": 7.884830036870922e-08, + "logits/chosen": -1.4870680570602417, + "logits/rejected": -1.4628260135650635, + "logps/chosen": -821.735595703125, + "logps/rejected": -956.1165771484375, + "loss": 0.484, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0134003162384033, + "rewards/margins": 1.0060707330703735, + "rewards/rejected": -4.019470691680908, + "step": 820 + }, + { + "epoch": 0.5362070373091681, + "grad_norm": 13.505022242348229, + "learning_rate": 7.867740518783371e-08, + "logits/chosen": -1.5747795104980469, + "logits/rejected": -1.5648517608642578, + "logps/chosen": -851.3766479492188, + "logps/rejected": -903.6992797851562, + "loss": 0.5012, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9860377311706543, + "rewards/margins": 0.8135138750076294, + "rewards/rejected": -3.799551486968994, + "step": 821 + }, + { + "epoch": 0.5368601518491305, + "grad_norm": 19.48923466978227, + "learning_rate": 7.85064908655819e-08, + "logits/chosen": -1.4971075057983398, + "logits/rejected": -1.548054814338684, + "logps/chosen": -855.16748046875, + "logps/rejected": -904.4014282226562, + "loss": 0.4958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.101994514465332, + "rewards/margins": 0.8241324424743652, + "rewards/rejected": -3.926126480102539, + "step": 822 + }, + { + "epoch": 0.537513266389093, + "grad_norm": 24.801244541119452, + "learning_rate": 7.833555829158527e-08, + "logits/chosen": -1.5962388515472412, + "logits/rejected": -1.5477280616760254, + "logps/chosen": -887.2816772460938, + "logps/rejected": -913.8333129882812, + "loss": 0.4767, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1752548217773438, + "rewards/margins": 0.6703548431396484, + "rewards/rejected": -3.845609664916992, + "step": 823 + }, + { + "epoch": 0.5381663809290554, + "grad_norm": 42.45784716129404, + "learning_rate": 7.816460835557028e-08, + "logits/chosen": -1.5705194473266602, + "logits/rejected": -1.5918525457382202, + "logps/chosen": -808.6838989257812, + "logps/rejected": -973.3630981445312, + "loss": 0.4975, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.012310743331909, + "rewards/margins": 1.2067666053771973, + "rewards/rejected": -4.2190775871276855, + "step": 824 + }, + { + "epoch": 0.5388194954690179, + "grad_norm": 18.2583142847124, + "learning_rate": 7.799364194735377e-08, + "logits/chosen": -1.5670945644378662, + "logits/rejected": -1.5296969413757324, + "logps/chosen": -861.2294311523438, + "logps/rejected": -866.001953125, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.31349515914917, + "rewards/margins": 0.387903094291687, + "rewards/rejected": -3.7013983726501465, + "step": 825 + }, + { + "epoch": 0.5394726100089803, + "grad_norm": 14.894630917984315, + "learning_rate": 7.782265995683828e-08, + "logits/chosen": -1.539316177368164, + "logits/rejected": -1.5315918922424316, + "logps/chosen": -808.724609375, + "logps/rejected": -842.3665771484375, + "loss": 0.4964, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.908222198486328, + "rewards/margins": 0.6761490106582642, + "rewards/rejected": -3.584371328353882, + "step": 826 + }, + { + "epoch": 0.5401257245489428, + "grad_norm": 46.701075509344925, + "learning_rate": 7.765166327400754e-08, + "logits/chosen": -1.5789310932159424, + "logits/rejected": -1.552915334701538, + "logps/chosen": -835.5314331054688, + "logps/rejected": -890.4103393554688, + "loss": 0.4631, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1613926887512207, + "rewards/margins": 0.716499924659729, + "rewards/rejected": -3.8778927326202393, + "step": 827 + }, + { + "epoch": 0.5407788390889052, + "grad_norm": 23.68366708316572, + "learning_rate": 7.748065278892171e-08, + "logits/chosen": -1.5369728803634644, + "logits/rejected": -1.5010067224502563, + "logps/chosen": -812.9715576171875, + "logps/rejected": -846.779052734375, + "loss": 0.5664, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.117812156677246, + "rewards/margins": 0.5145601630210876, + "rewards/rejected": -3.6323723793029785, + "step": 828 + }, + { + "epoch": 0.5414319536288676, + "grad_norm": 9.80288754353676, + "learning_rate": 7.730962939171278e-08, + "logits/chosen": -1.5014228820800781, + "logits/rejected": -1.503250241279602, + "logps/chosen": -921.4869384765625, + "logps/rejected": -941.43896484375, + "loss": 0.4775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4559803009033203, + "rewards/margins": 0.8563462495803833, + "rewards/rejected": -4.312326431274414, + "step": 829 + }, + { + "epoch": 0.5420850681688301, + "grad_norm": 23.08260983849523, + "learning_rate": 7.713859397257995e-08, + "logits/chosen": -1.5203160047531128, + "logits/rejected": -1.4632571935653687, + "logps/chosen": -813.74267578125, + "logps/rejected": -845.8812255859375, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1719284057617188, + "rewards/margins": 0.44786882400512695, + "rewards/rejected": -3.6197972297668457, + "step": 830 + }, + { + "epoch": 0.5427381827087926, + "grad_norm": 86.00587707569419, + "learning_rate": 7.696754742178503e-08, + "logits/chosen": -1.5021620988845825, + "logits/rejected": -1.5242396593093872, + "logps/chosen": -817.4739990234375, + "logps/rejected": -865.0052490234375, + "loss": 0.5095, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8962783813476562, + "rewards/margins": 0.689846932888031, + "rewards/rejected": -3.586124897003174, + "step": 831 + }, + { + "epoch": 0.543391297248755, + "grad_norm": 52.62733320349321, + "learning_rate": 7.679649062964774e-08, + "logits/chosen": -1.567655324935913, + "logits/rejected": -1.5549966096878052, + "logps/chosen": -735.2625732421875, + "logps/rejected": -834.2606811523438, + "loss": 0.459, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9052071571350098, + "rewards/margins": 0.9503459930419922, + "rewards/rejected": -3.85555362701416, + "step": 832 + }, + { + "epoch": 0.5440444117887174, + "grad_norm": 34.035780541177466, + "learning_rate": 7.662542448654109e-08, + "logits/chosen": -1.5708342790603638, + "logits/rejected": -1.511175274848938, + "logps/chosen": -821.263671875, + "logps/rejected": -975.2786254882812, + "loss": 0.5008, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.831267833709717, + "rewards/margins": 1.1821346282958984, + "rewards/rejected": -4.013402462005615, + "step": 833 + }, + { + "epoch": 0.5446975263286798, + "grad_norm": 12.816837148569553, + "learning_rate": 7.645434988288683e-08, + "logits/chosen": -1.5968691110610962, + "logits/rejected": -1.5883498191833496, + "logps/chosen": -778.30712890625, + "logps/rejected": -840.7581176757812, + "loss": 0.5001, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8528571128845215, + "rewards/margins": 0.7486177086830139, + "rewards/rejected": -3.6014747619628906, + "step": 834 + }, + { + "epoch": 0.5453506408686424, + "grad_norm": 22.221139309815047, + "learning_rate": 7.628326770915069e-08, + "logits/chosen": -1.5044746398925781, + "logits/rejected": -1.4766840934753418, + "logps/chosen": -839.3434448242188, + "logps/rejected": -888.7236328125, + "loss": 0.4587, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.229734182357788, + "rewards/margins": 0.8066515922546387, + "rewards/rejected": -4.036386013031006, + "step": 835 + }, + { + "epoch": 0.5460037554086048, + "grad_norm": 21.464697851921866, + "learning_rate": 7.611217885583783e-08, + "logits/chosen": -1.6017417907714844, + "logits/rejected": -1.6247013807296753, + "logps/chosen": -887.736328125, + "logps/rejected": -987.9991455078125, + "loss": 0.5011, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1084957122802734, + "rewards/margins": 0.854011595249176, + "rewards/rejected": -3.9625072479248047, + "step": 836 + }, + { + "epoch": 0.5466568699485672, + "grad_norm": 39.79039092333048, + "learning_rate": 7.594108421348816e-08, + "logits/chosen": -1.4360780715942383, + "logits/rejected": -1.4754202365875244, + "logps/chosen": -759.8140258789062, + "logps/rejected": -813.8535766601562, + "loss": 0.5061, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8158435821533203, + "rewards/margins": 0.8641850352287292, + "rewards/rejected": -3.6800286769866943, + "step": 837 + }, + { + "epoch": 0.5473099844885296, + "grad_norm": 55.549125631484166, + "learning_rate": 7.576998467267174e-08, + "logits/chosen": -1.56089448928833, + "logits/rejected": -1.581824541091919, + "logps/chosen": -909.1053466796875, + "logps/rejected": -948.4317016601562, + "loss": 0.5306, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.213887929916382, + "rewards/margins": 0.46870189905166626, + "rewards/rejected": -3.6825897693634033, + "step": 838 + }, + { + "epoch": 0.5479630990284922, + "grad_norm": 55.14261568208239, + "learning_rate": 7.559888112398411e-08, + "logits/chosen": -1.4533392190933228, + "logits/rejected": -1.4627681970596313, + "logps/chosen": -726.8400268554688, + "logps/rejected": -814.5458374023438, + "loss": 0.5163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.764704942703247, + "rewards/margins": 0.9564896821975708, + "rewards/rejected": -3.7211947441101074, + "step": 839 + }, + { + "epoch": 0.5486162135684546, + "grad_norm": 30.505010236460816, + "learning_rate": 7.542777445804171e-08, + "logits/chosen": -1.3663051128387451, + "logits/rejected": -1.3810725212097168, + "logps/chosen": -795.670654296875, + "logps/rejected": -962.4092407226562, + "loss": 0.4755, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0239500999450684, + "rewards/margins": 1.2967259883880615, + "rewards/rejected": -4.320675849914551, + "step": 840 + }, + { + "epoch": 0.549269328108417, + "grad_norm": 14.300839480022828, + "learning_rate": 7.525666556547714e-08, + "logits/chosen": -1.6288928985595703, + "logits/rejected": -1.5527688264846802, + "logps/chosen": -799.6822509765625, + "logps/rejected": -940.3196411132812, + "loss": 0.494, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9739322662353516, + "rewards/margins": 0.9197661876678467, + "rewards/rejected": -3.893698215484619, + "step": 841 + }, + { + "epoch": 0.5499224426483794, + "grad_norm": 88.7947159228681, + "learning_rate": 7.508555533693462e-08, + "logits/chosen": -1.4860543012619019, + "logits/rejected": -1.4642175436019897, + "logps/chosen": -732.2152099609375, + "logps/rejected": -754.6222534179688, + "loss": 0.4724, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6171233654022217, + "rewards/margins": 0.7891316413879395, + "rewards/rejected": -3.4062552452087402, + "step": 842 + }, + { + "epoch": 0.550575557188342, + "grad_norm": 69.35311679506285, + "learning_rate": 7.49144446630654e-08, + "logits/chosen": -1.4915255308151245, + "logits/rejected": -1.4560573101043701, + "logps/chosen": -874.8602294921875, + "logps/rejected": -909.4222412109375, + "loss": 0.5181, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3296079635620117, + "rewards/margins": 0.6430115699768066, + "rewards/rejected": -3.9726200103759766, + "step": 843 + }, + { + "epoch": 0.5512286717283044, + "grad_norm": 14.122314075620803, + "learning_rate": 7.474333443452289e-08, + "logits/chosen": -1.5023688077926636, + "logits/rejected": -1.4814002513885498, + "logps/chosen": -812.0813598632812, + "logps/rejected": -892.1846923828125, + "loss": 0.4773, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.119580030441284, + "rewards/margins": 1.089004397392273, + "rewards/rejected": -4.208584785461426, + "step": 844 + }, + { + "epoch": 0.5518817862682668, + "grad_norm": 22.523095660476606, + "learning_rate": 7.45722255419583e-08, + "logits/chosen": -1.598945140838623, + "logits/rejected": -1.5560745000839233, + "logps/chosen": -848.6195678710938, + "logps/rejected": -891.3744506835938, + "loss": 0.5091, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.038963556289673, + "rewards/margins": 0.6308895349502563, + "rewards/rejected": -3.6698527336120605, + "step": 845 + }, + { + "epoch": 0.5525349008082292, + "grad_norm": 37.5554038949777, + "learning_rate": 7.44011188760159e-08, + "logits/chosen": -1.593327283859253, + "logits/rejected": -1.6003532409667969, + "logps/chosen": -984.0642700195312, + "logps/rejected": -945.6554565429688, + "loss": 0.5136, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7509801387786865, + "rewards/margins": 0.44811174273490906, + "rewards/rejected": -4.199091911315918, + "step": 846 + }, + { + "epoch": 0.5531880153481917, + "grad_norm": 16.920523902644987, + "learning_rate": 7.423001532732826e-08, + "logits/chosen": -1.5089354515075684, + "logits/rejected": -1.468766212463379, + "logps/chosen": -713.7976684570312, + "logps/rejected": -817.9136962890625, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9451818466186523, + "rewards/margins": 1.023890495300293, + "rewards/rejected": -3.9690723419189453, + "step": 847 + }, + { + "epoch": 0.5538411298881541, + "grad_norm": 11.301729003307928, + "learning_rate": 7.405891578651185e-08, + "logits/chosen": -1.5803592205047607, + "logits/rejected": -1.5427913665771484, + "logps/chosen": -875.8158569335938, + "logps/rejected": -872.2986450195312, + "loss": 0.5062, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.213369369506836, + "rewards/margins": 0.4956037104129791, + "rewards/rejected": -3.708972930908203, + "step": 848 + }, + { + "epoch": 0.5544942444281166, + "grad_norm": 74.39833754806011, + "learning_rate": 7.388782114416217e-08, + "logits/chosen": -1.6001735925674438, + "logits/rejected": -1.5752531290054321, + "logps/chosen": -795.3203125, + "logps/rejected": -860.407470703125, + "loss": 0.5166, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.002932548522949, + "rewards/margins": 0.9377602934837341, + "rewards/rejected": -3.9406933784484863, + "step": 849 + }, + { + "epoch": 0.555147358968079, + "grad_norm": 19.66725027995187, + "learning_rate": 7.371673229084931e-08, + "logits/chosen": -1.5427309274673462, + "logits/rejected": -1.5140000581741333, + "logps/chosen": -799.3212890625, + "logps/rejected": -892.1961669921875, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.130077838897705, + "rewards/margins": 1.1122186183929443, + "rewards/rejected": -4.24229621887207, + "step": 850 + }, + { + "epoch": 0.5558004735080415, + "grad_norm": 40.531931496184235, + "learning_rate": 7.354565011711317e-08, + "logits/chosen": -1.4393893480300903, + "logits/rejected": -1.425200343132019, + "logps/chosen": -727.331787109375, + "logps/rejected": -858.167236328125, + "loss": 0.4825, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9096834659576416, + "rewards/margins": 1.041924238204956, + "rewards/rejected": -3.9516077041625977, + "step": 851 + }, + { + "epoch": 0.5564535880480039, + "grad_norm": 16.984478852782892, + "learning_rate": 7.33745755134589e-08, + "logits/chosen": -1.5062000751495361, + "logits/rejected": -1.53849458694458, + "logps/chosen": -802.3837890625, + "logps/rejected": -890.3169555664062, + "loss": 0.5344, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2534284591674805, + "rewards/margins": 0.8640724420547485, + "rewards/rejected": -4.1175007820129395, + "step": 852 + }, + { + "epoch": 0.5571067025879664, + "grad_norm": 35.84235839938917, + "learning_rate": 7.320350937035228e-08, + "logits/chosen": -1.5342974662780762, + "logits/rejected": -1.5023765563964844, + "logps/chosen": -835.8595581054688, + "logps/rejected": -865.0036010742188, + "loss": 0.5255, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0694611072540283, + "rewards/margins": 0.7715376615524292, + "rewards/rejected": -3.840998411178589, + "step": 853 + }, + { + "epoch": 0.5577598171279288, + "grad_norm": 10.233512310662372, + "learning_rate": 7.303245257821498e-08, + "logits/chosen": -1.517195701599121, + "logits/rejected": -1.518413782119751, + "logps/chosen": -832.9429931640625, + "logps/rejected": -876.033447265625, + "loss": 0.5145, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8127410411834717, + "rewards/margins": 0.624780535697937, + "rewards/rejected": -3.437521457672119, + "step": 854 + }, + { + "epoch": 0.5584129316678913, + "grad_norm": 26.56768549861038, + "learning_rate": 7.286140602742005e-08, + "logits/chosen": -1.4718785285949707, + "logits/rejected": -1.4336018562316895, + "logps/chosen": -897.831298828125, + "logps/rejected": -885.6130981445312, + "loss": 0.5896, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3304998874664307, + "rewards/margins": 0.5319950580596924, + "rewards/rejected": -3.862494945526123, + "step": 855 + }, + { + "epoch": 0.5590660462078537, + "grad_norm": 184.22901412620044, + "learning_rate": 7.269037060828724e-08, + "logits/chosen": -1.501589298248291, + "logits/rejected": -1.5195332765579224, + "logps/chosen": -806.921875, + "logps/rejected": -836.3479614257812, + "loss": 0.5501, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9807791709899902, + "rewards/margins": 0.5941177606582642, + "rewards/rejected": -3.574897050857544, + "step": 856 + }, + { + "epoch": 0.5597191607478161, + "grad_norm": 67.4144741062901, + "learning_rate": 7.25193472110783e-08, + "logits/chosen": -1.5734254121780396, + "logits/rejected": -1.5101865530014038, + "logps/chosen": -823.78759765625, + "logps/rejected": -862.5943603515625, + "loss": 0.4974, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.210695266723633, + "rewards/margins": 0.569321870803833, + "rewards/rejected": -3.780017375946045, + "step": 857 + }, + { + "epoch": 0.5603722752877786, + "grad_norm": 18.871171186142504, + "learning_rate": 7.234833672599245e-08, + "logits/chosen": -1.580259919166565, + "logits/rejected": -1.5304930210113525, + "logps/chosen": -937.5279541015625, + "logps/rejected": -984.5499267578125, + "loss": 0.5273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.529520034790039, + "rewards/margins": 0.9078921675682068, + "rewards/rejected": -4.437412261962891, + "step": 858 + }, + { + "epoch": 0.5610253898277411, + "grad_norm": 48.26102317175817, + "learning_rate": 7.217734004316172e-08, + "logits/chosen": -1.5722343921661377, + "logits/rejected": -1.5775549411773682, + "logps/chosen": -803.7421875, + "logps/rejected": -866.3418579101562, + "loss": 0.4969, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7525508403778076, + "rewards/margins": 0.8512840270996094, + "rewards/rejected": -3.603835105895996, + "step": 859 + }, + { + "epoch": 0.5616785043677035, + "grad_norm": 13.00017996435541, + "learning_rate": 7.200635805264625e-08, + "logits/chosen": -1.551430583000183, + "logits/rejected": -1.4977049827575684, + "logps/chosen": -795.7646484375, + "logps/rejected": -836.5863037109375, + "loss": 0.4979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.18217134475708, + "rewards/margins": 0.8377689123153687, + "rewards/rejected": -4.019940376281738, + "step": 860 + }, + { + "epoch": 0.5623316189076659, + "grad_norm": 9.231732491941768, + "learning_rate": 7.183539164442973e-08, + "logits/chosen": -1.530940055847168, + "logits/rejected": -1.5165448188781738, + "logps/chosen": -862.6856689453125, + "logps/rejected": -992.7938232421875, + "loss": 0.455, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.554490089416504, + "rewards/margins": 1.1885462999343872, + "rewards/rejected": -4.743036270141602, + "step": 861 + }, + { + "epoch": 0.5629847334476283, + "grad_norm": 62.61248101231807, + "learning_rate": 7.166444170841473e-08, + "logits/chosen": -1.4707496166229248, + "logits/rejected": -1.4739537239074707, + "logps/chosen": -821.8203125, + "logps/rejected": -850.6475219726562, + "loss": 0.5211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7928149700164795, + "rewards/margins": 0.47822028398513794, + "rewards/rejected": -3.2710354328155518, + "step": 862 + }, + { + "epoch": 0.5636378479875909, + "grad_norm": 109.02226402387393, + "learning_rate": 7.149350913441809e-08, + "logits/chosen": -1.5233734846115112, + "logits/rejected": -1.5370923280715942, + "logps/chosen": -793.0894775390625, + "logps/rejected": -808.7146606445312, + "loss": 0.534, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8652713298797607, + "rewards/margins": 0.5744865536689758, + "rewards/rejected": -3.43975830078125, + "step": 863 + }, + { + "epoch": 0.5642909625275533, + "grad_norm": 29.93004571485592, + "learning_rate": 7.132259481216628e-08, + "logits/chosen": -1.543608546257019, + "logits/rejected": -1.496211051940918, + "logps/chosen": -812.6435546875, + "logps/rejected": -853.930908203125, + "loss": 0.4421, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.108489990234375, + "rewards/margins": 0.8767549991607666, + "rewards/rejected": -3.9852447509765625, + "step": 864 + }, + { + "epoch": 0.5649440770675157, + "grad_norm": 61.12660963375552, + "learning_rate": 7.115169963129076e-08, + "logits/chosen": -1.5221493244171143, + "logits/rejected": -1.4868009090423584, + "logps/chosen": -754.1113891601562, + "logps/rejected": -894.2318115234375, + "loss": 0.475, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7326500415802, + "rewards/margins": 0.7210690975189209, + "rewards/rejected": -3.453718662261963, + "step": 865 + }, + { + "epoch": 0.5655971916074781, + "grad_norm": 17.649759346104574, + "learning_rate": 7.098082448132339e-08, + "logits/chosen": -1.4271429777145386, + "logits/rejected": -1.3975777626037598, + "logps/chosen": -795.72412109375, + "logps/rejected": -917.9866943359375, + "loss": 0.5119, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1587576866149902, + "rewards/margins": 0.951889157295227, + "rewards/rejected": -4.110646724700928, + "step": 866 + }, + { + "epoch": 0.5662503061474407, + "grad_norm": 8.92934245433921, + "learning_rate": 7.080997025169167e-08, + "logits/chosen": -1.5210411548614502, + "logits/rejected": -1.5380655527114868, + "logps/chosen": -813.1768798828125, + "logps/rejected": -933.7425537109375, + "loss": 0.4734, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.348355293273926, + "rewards/margins": 0.8711065649986267, + "rewards/rejected": -4.2194623947143555, + "step": 867 + }, + { + "epoch": 0.5669034206874031, + "grad_norm": 12.604264013248898, + "learning_rate": 7.063913783171431e-08, + "logits/chosen": -1.532928466796875, + "logits/rejected": -1.5313588380813599, + "logps/chosen": -769.3619995117188, + "logps/rejected": -937.7196044921875, + "loss": 0.4693, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7876639366149902, + "rewards/margins": 1.2222365140914917, + "rewards/rejected": -4.0099005699157715, + "step": 868 + }, + { + "epoch": 0.5675565352273655, + "grad_norm": 78.29234208780828, + "learning_rate": 7.046832811059646e-08, + "logits/chosen": -1.5154472589492798, + "logits/rejected": -1.5125453472137451, + "logps/chosen": -837.0894165039062, + "logps/rejected": -932.6472778320312, + "loss": 0.4858, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.860597848892212, + "rewards/margins": 0.9204597473144531, + "rewards/rejected": -3.781057596206665, + "step": 869 + }, + { + "epoch": 0.5682096497673279, + "grad_norm": 180.9882527133902, + "learning_rate": 7.029754197742512e-08, + "logits/chosen": -1.5261096954345703, + "logits/rejected": -1.5237348079681396, + "logps/chosen": -826.7614135742188, + "logps/rejected": -1008.6333618164062, + "loss": 0.5309, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2434701919555664, + "rewards/margins": 1.0818681716918945, + "rewards/rejected": -4.325338363647461, + "step": 870 + }, + { + "epoch": 0.5688627643072904, + "grad_norm": 40.32414479490123, + "learning_rate": 7.01267803211645e-08, + "logits/chosen": -1.5508898496627808, + "logits/rejected": -1.527504801750183, + "logps/chosen": -790.9599609375, + "logps/rejected": -838.9732666015625, + "loss": 0.5444, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0933451652526855, + "rewards/margins": 0.376648485660553, + "rewards/rejected": -3.469994306564331, + "step": 871 + }, + { + "epoch": 0.5695158788472529, + "grad_norm": 72.83509120194428, + "learning_rate": 6.995604403065144e-08, + "logits/chosen": -1.5775620937347412, + "logits/rejected": -1.5864499807357788, + "logps/chosen": -769.3580932617188, + "logps/rejected": -876.96484375, + "loss": 0.5465, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.112041473388672, + "rewards/margins": 0.8449998497962952, + "rewards/rejected": -3.957040786743164, + "step": 872 + }, + { + "epoch": 0.5701689933872153, + "grad_norm": 12.905974016118856, + "learning_rate": 6.978533399459071e-08, + "logits/chosen": -1.5712107419967651, + "logits/rejected": -1.5447235107421875, + "logps/chosen": -777.7793579101562, + "logps/rejected": -840.2425537109375, + "loss": 0.5643, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9859039783477783, + "rewards/margins": 0.5791632533073425, + "rewards/rejected": -3.5650668144226074, + "step": 873 + }, + { + "epoch": 0.5708221079271777, + "grad_norm": 21.726965969176625, + "learning_rate": 6.961465110155043e-08, + "logits/chosen": -1.512514352798462, + "logits/rejected": -1.4853906631469727, + "logps/chosen": -893.4528198242188, + "logps/rejected": -1067.001220703125, + "loss": 0.4754, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4161694049835205, + "rewards/margins": 1.114807367324829, + "rewards/rejected": -4.530977249145508, + "step": 874 + }, + { + "epoch": 0.5714752224671402, + "grad_norm": 22.280036018028355, + "learning_rate": 6.944399623995744e-08, + "logits/chosen": -1.5252811908721924, + "logits/rejected": -1.5037376880645752, + "logps/chosen": -850.0037841796875, + "logps/rejected": -850.1954345703125, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4480960369110107, + "rewards/margins": 0.41501325368881226, + "rewards/rejected": -3.8631091117858887, + "step": 875 + }, + { + "epoch": 0.5721283370071026, + "grad_norm": 40.53249032061718, + "learning_rate": 6.92733702980927e-08, + "logits/chosen": -1.5603222846984863, + "logits/rejected": -1.5633220672607422, + "logps/chosen": -817.9495849609375, + "logps/rejected": -1105.2486572265625, + "loss": 0.442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2014811038970947, + "rewards/margins": 1.719805359840393, + "rewards/rejected": -4.921286106109619, + "step": 876 + }, + { + "epoch": 0.5727814515470651, + "grad_norm": 18.002814145085477, + "learning_rate": 6.910277416408661e-08, + "logits/chosen": -1.5658931732177734, + "logits/rejected": -1.5071964263916016, + "logps/chosen": -770.361083984375, + "logps/rejected": -793.378173828125, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1433448791503906, + "rewards/margins": 0.5039821863174438, + "rewards/rejected": -3.647326946258545, + "step": 877 + }, + { + "epoch": 0.5734345660870275, + "grad_norm": 12.307347390907383, + "learning_rate": 6.89322087259144e-08, + "logits/chosen": -1.511487603187561, + "logits/rejected": -1.5128693580627441, + "logps/chosen": -846.0947875976562, + "logps/rejected": -906.3300170898438, + "loss": 0.4864, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2089366912841797, + "rewards/margins": 0.9323613047599792, + "rewards/rejected": -4.141297340393066, + "step": 878 + }, + { + "epoch": 0.57408768062699, + "grad_norm": 21.452756223686876, + "learning_rate": 6.876167487139154e-08, + "logits/chosen": -1.526938796043396, + "logits/rejected": -1.5194412469863892, + "logps/chosen": -1003.674560546875, + "logps/rejected": -1024.025390625, + "loss": 0.5129, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.6137397289276123, + "rewards/margins": 0.7001018524169922, + "rewards/rejected": -4.313840866088867, + "step": 879 + }, + { + "epoch": 0.5747407951669524, + "grad_norm": 12.942447833043328, + "learning_rate": 6.859117348816912e-08, + "logits/chosen": -1.4752848148345947, + "logits/rejected": -1.3929071426391602, + "logps/chosen": -797.0738525390625, + "logps/rejected": -978.4248046875, + "loss": 0.5142, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4416940212249756, + "rewards/margins": 1.1079531908035278, + "rewards/rejected": -4.549646854400635, + "step": 880 + }, + { + "epoch": 0.5753939097069148, + "grad_norm": 64.69859008439977, + "learning_rate": 6.842070546372922e-08, + "logits/chosen": -1.5244097709655762, + "logits/rejected": -1.486146092414856, + "logps/chosen": -969.6875, + "logps/rejected": -1005.9173583984375, + "loss": 0.6013, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9753427505493164, + "rewards/margins": 0.6842749118804932, + "rewards/rejected": -4.659617900848389, + "step": 881 + }, + { + "epoch": 0.5760470242468773, + "grad_norm": 42.79091393495759, + "learning_rate": 6.825027168538024e-08, + "logits/chosen": -1.4888124465942383, + "logits/rejected": -1.4149153232574463, + "logps/chosen": -813.0571899414062, + "logps/rejected": -825.3543701171875, + "loss": 0.574, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.368730068206787, + "rewards/margins": 0.39747297763824463, + "rewards/rejected": -3.766202926635742, + "step": 882 + }, + { + "epoch": 0.5767001387868398, + "grad_norm": 18.008747424093183, + "learning_rate": 6.807987304025236e-08, + "logits/chosen": -1.4898674488067627, + "logits/rejected": -1.4935599565505981, + "logps/chosen": -851.6963500976562, + "logps/rejected": -859.6405639648438, + "loss": 0.5219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2997894287109375, + "rewards/margins": 0.487488329410553, + "rewards/rejected": -3.787277936935425, + "step": 883 + }, + { + "epoch": 0.5773532533268022, + "grad_norm": 41.196321721092666, + "learning_rate": 6.790951041529286e-08, + "logits/chosen": -1.5880982875823975, + "logits/rejected": -1.5457425117492676, + "logps/chosen": -958.2757568359375, + "logps/rejected": -986.9423828125, + "loss": 0.522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4477083683013916, + "rewards/margins": 0.6854377388954163, + "rewards/rejected": -4.133145809173584, + "step": 884 + }, + { + "epoch": 0.5780063678667646, + "grad_norm": 30.703596533657475, + "learning_rate": 6.773918469726156e-08, + "logits/chosen": -1.480318546295166, + "logits/rejected": -1.4444735050201416, + "logps/chosen": -940.3262939453125, + "logps/rejected": -1004.0597534179688, + "loss": 0.5406, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.5821070671081543, + "rewards/margins": 0.554044246673584, + "rewards/rejected": -4.136151313781738, + "step": 885 + }, + { + "epoch": 0.578659482406727, + "grad_norm": 126.80730044989116, + "learning_rate": 6.756889677272617e-08, + "logits/chosen": -1.5217347145080566, + "logits/rejected": -1.498375415802002, + "logps/chosen": -762.6358642578125, + "logps/rejected": -902.398681640625, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.058880567550659, + "rewards/margins": 0.8846085667610168, + "rewards/rejected": -3.943488597869873, + "step": 886 + }, + { + "epoch": 0.5793125969466896, + "grad_norm": 11.803823871913366, + "learning_rate": 6.739864752805765e-08, + "logits/chosen": -1.4541584253311157, + "logits/rejected": -1.426889181137085, + "logps/chosen": -766.4666137695312, + "logps/rejected": -813.1023559570312, + "loss": 0.4918, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2732667922973633, + "rewards/margins": 0.5391014218330383, + "rewards/rejected": -3.8123679161071777, + "step": 887 + }, + { + "epoch": 0.579965711486652, + "grad_norm": 21.000885515886726, + "learning_rate": 6.722843784942565e-08, + "logits/chosen": -1.4722752571105957, + "logits/rejected": -1.467155933380127, + "logps/chosen": -837.9805908203125, + "logps/rejected": -857.227294921875, + "loss": 0.6011, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.297391653060913, + "rewards/margins": 0.3573925197124481, + "rewards/rejected": -3.6547842025756836, + "step": 888 + }, + { + "epoch": 0.5806188260266144, + "grad_norm": 17.365098193739787, + "learning_rate": 6.705826862279391e-08, + "logits/chosen": -1.4612233638763428, + "logits/rejected": -1.4755150079727173, + "logps/chosen": -873.2598876953125, + "logps/rejected": -963.6185913085938, + "loss": 0.5842, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1137642860412598, + "rewards/margins": 1.115809679031372, + "rewards/rejected": -4.229574203491211, + "step": 889 + }, + { + "epoch": 0.5812719405665768, + "grad_norm": 13.20416027815365, + "learning_rate": 6.688814073391551e-08, + "logits/chosen": -1.4795432090759277, + "logits/rejected": -1.528482437133789, + "logps/chosen": -914.3403930664062, + "logps/rejected": -992.7194213867188, + "loss": 0.5484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8117380142211914, + "rewards/margins": 0.9581300616264343, + "rewards/rejected": -4.769867897033691, + "step": 890 + }, + { + "epoch": 0.5819250551065394, + "grad_norm": 58.643014274399484, + "learning_rate": 6.671805506832844e-08, + "logits/chosen": -1.4377716779708862, + "logits/rejected": -1.4766364097595215, + "logps/chosen": -770.939697265625, + "logps/rejected": -905.95458984375, + "loss": 0.5286, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8314456939697266, + "rewards/margins": 0.9767945408821106, + "rewards/rejected": -3.8082404136657715, + "step": 891 + }, + { + "epoch": 0.5825781696465018, + "grad_norm": 55.6022723858616, + "learning_rate": 6.654801251135092e-08, + "logits/chosen": -1.6092873811721802, + "logits/rejected": -1.6193262338638306, + "logps/chosen": -859.6929321289062, + "logps/rejected": -937.2235107421875, + "loss": 0.5144, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6701273918151855, + "rewards/margins": 0.6065517067909241, + "rewards/rejected": -4.276679039001465, + "step": 892 + }, + { + "epoch": 0.5832312841864642, + "grad_norm": 21.239795373704855, + "learning_rate": 6.637801394807675e-08, + "logits/chosen": -1.465186357498169, + "logits/rejected": -1.5041096210479736, + "logps/chosen": -876.3399047851562, + "logps/rejected": -936.5634765625, + "loss": 0.479, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4567980766296387, + "rewards/margins": 0.6494026184082031, + "rewards/rejected": -4.106200695037842, + "step": 893 + }, + { + "epoch": 0.5838843987264266, + "grad_norm": 22.901871352735185, + "learning_rate": 6.620806026337073e-08, + "logits/chosen": -1.567824363708496, + "logits/rejected": -1.5391998291015625, + "logps/chosen": -831.55615234375, + "logps/rejected": -894.0813598632812, + "loss": 0.5542, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1797890663146973, + "rewards/margins": 0.6112327575683594, + "rewards/rejected": -3.7910218238830566, + "step": 894 + }, + { + "epoch": 0.5845375132663891, + "grad_norm": 18.99146292432215, + "learning_rate": 6.603815234186409e-08, + "logits/chosen": -1.5131785869598389, + "logits/rejected": -1.4995113611221313, + "logps/chosen": -811.837158203125, + "logps/rejected": -859.697265625, + "loss": 0.4643, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3386635780334473, + "rewards/margins": 1.0135873556137085, + "rewards/rejected": -4.352251052856445, + "step": 895 + }, + { + "epoch": 0.5851906278063516, + "grad_norm": 85.94411267727696, + "learning_rate": 6.586829106794986e-08, + "logits/chosen": -1.4760074615478516, + "logits/rejected": -1.5129234790802002, + "logps/chosen": -827.99560546875, + "logps/rejected": -902.9091796875, + "loss": 0.55, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.300877809524536, + "rewards/margins": 0.9479480981826782, + "rewards/rejected": -4.248825550079346, + "step": 896 + }, + { + "epoch": 0.585843742346314, + "grad_norm": 28.761272058111484, + "learning_rate": 6.569847732577822e-08, + "logits/chosen": -1.51106595993042, + "logits/rejected": -1.5085562467575073, + "logps/chosen": -797.22216796875, + "logps/rejected": -902.5498046875, + "loss": 0.4886, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1795077323913574, + "rewards/margins": 0.7169556021690369, + "rewards/rejected": -3.896463394165039, + "step": 897 + }, + { + "epoch": 0.5864968568862764, + "grad_norm": 20.15314666563717, + "learning_rate": 6.5528711999252e-08, + "logits/chosen": -1.5399476289749146, + "logits/rejected": -1.527458906173706, + "logps/chosen": -862.6876831054688, + "logps/rejected": -874.1339111328125, + "loss": 0.5053, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.810636520385742, + "rewards/margins": 0.7659615278244019, + "rewards/rejected": -3.5765976905822754, + "step": 898 + }, + { + "epoch": 0.5871499714262389, + "grad_norm": 12.274080681893953, + "learning_rate": 6.535899597202195e-08, + "logits/chosen": -1.513730525970459, + "logits/rejected": -1.484087586402893, + "logps/chosen": -859.1988525390625, + "logps/rejected": -1115.99365234375, + "loss": 0.429, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3637874126434326, + "rewards/margins": 1.6439146995544434, + "rewards/rejected": -5.007702350616455, + "step": 899 + }, + { + "epoch": 0.5878030859662013, + "grad_norm": 46.88003480260202, + "learning_rate": 6.518933012748232e-08, + "logits/chosen": -1.4574429988861084, + "logits/rejected": -1.4671128988265991, + "logps/chosen": -768.5726928710938, + "logps/rejected": -817.6867065429688, + "loss": 0.5007, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.511812210083008, + "rewards/margins": 0.7634918689727783, + "rewards/rejected": -3.2753043174743652, + "step": 900 + }, + { + "epoch": 0.5878030859662013, + "eval_logits/chosen": -1.5143874883651733, + "eval_logits/rejected": -1.500697135925293, + "eval_logps/chosen": -819.5907592773438, + "eval_logps/rejected": -894.2119750976562, + "eval_loss": 0.5088227987289429, + "eval_rewards/accuracies": 0.7540000081062317, + "eval_rewards/chosen": -3.092453956604004, + "eval_rewards/margins": 0.8272719979286194, + "eval_rewards/rejected": -3.9197258949279785, + "eval_runtime": 300.2638, + "eval_samples_per_second": 13.322, + "eval_steps_per_second": 0.833, + "step": 900 + }, + { + "epoch": 0.5884562005061638, + "grad_norm": 15.611510087556148, + "learning_rate": 6.5019715348766e-08, + "logits/chosen": -1.4778497219085693, + "logits/rejected": -1.4426562786102295, + "logps/chosen": -764.3760986328125, + "logps/rejected": -894.08740234375, + "loss": 0.4707, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.829637289047241, + "rewards/margins": 1.0723354816436768, + "rewards/rejected": -3.901973247528076, + "step": 901 + }, + { + "epoch": 0.5891093150461262, + "grad_norm": 27.04744713063808, + "learning_rate": 6.485015251874019e-08, + "logits/chosen": -1.4524825811386108, + "logits/rejected": -1.4247350692749023, + "logps/chosen": -833.126708984375, + "logps/rejected": -851.63037109375, + "loss": 0.5002, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4548346996307373, + "rewards/margins": 0.45528683066368103, + "rewards/rejected": -3.910121440887451, + "step": 902 + }, + { + "epoch": 0.5897624295860887, + "grad_norm": 14.887394055487325, + "learning_rate": 6.468064252000163e-08, + "logits/chosen": -1.4548847675323486, + "logits/rejected": -1.441070318222046, + "logps/chosen": -803.3111572265625, + "logps/rejected": -800.1891479492188, + "loss": 0.5278, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.290768623352051, + "rewards/margins": 0.5420413613319397, + "rewards/rejected": -3.8328099250793457, + "step": 903 + }, + { + "epoch": 0.5904155441260511, + "grad_norm": 88.18212760967876, + "learning_rate": 6.451118623487215e-08, + "logits/chosen": -1.4934018850326538, + "logits/rejected": -1.491977334022522, + "logps/chosen": -763.4550170898438, + "logps/rejected": -997.37255859375, + "loss": 0.5368, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9802534580230713, + "rewards/margins": 1.5265756845474243, + "rewards/rejected": -4.506828784942627, + "step": 904 + }, + { + "epoch": 0.5910686586660135, + "grad_norm": 24.118764808026647, + "learning_rate": 6.434178454539393e-08, + "logits/chosen": -1.4591268301010132, + "logits/rejected": -1.475930094718933, + "logps/chosen": -738.4482421875, + "logps/rejected": -801.5521850585938, + "loss": 0.4642, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7284841537475586, + "rewards/margins": 0.7377627491950989, + "rewards/rejected": -3.466247081756592, + "step": 905 + }, + { + "epoch": 0.591721773205976, + "grad_norm": 17.550624193334937, + "learning_rate": 6.417243833332495e-08, + "logits/chosen": -1.5250346660614014, + "logits/rejected": -1.4923107624053955, + "logps/chosen": -816.05419921875, + "logps/rejected": -943.5857543945312, + "loss": 0.5238, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.181175708770752, + "rewards/margins": 1.0177024602890015, + "rewards/rejected": -4.198878288269043, + "step": 906 + }, + { + "epoch": 0.5923748877459385, + "grad_norm": 23.27302229809888, + "learning_rate": 6.400314848013446e-08, + "logits/chosen": -1.5263398885726929, + "logits/rejected": -1.5403982400894165, + "logps/chosen": -898.93408203125, + "logps/rejected": -945.4384765625, + "loss": 0.5122, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.269822835922241, + "rewards/margins": 0.5661499500274658, + "rewards/rejected": -3.835972785949707, + "step": 907 + }, + { + "epoch": 0.5930280022859009, + "grad_norm": 56.67567956066173, + "learning_rate": 6.383391586699837e-08, + "logits/chosen": -1.5245555639266968, + "logits/rejected": -1.5307707786560059, + "logps/chosen": -879.91064453125, + "logps/rejected": -969.3151245117188, + "loss": 0.4661, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0276691913604736, + "rewards/margins": 0.8546069264411926, + "rewards/rejected": -3.8822762966156006, + "step": 908 + }, + { + "epoch": 0.5936811168258633, + "grad_norm": 18.262554233951846, + "learning_rate": 6.366474137479459e-08, + "logits/chosen": -1.5455853939056396, + "logits/rejected": -1.428063154220581, + "logps/chosen": -790.438720703125, + "logps/rejected": -823.03271484375, + "loss": 0.5309, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.124584674835205, + "rewards/margins": 0.8692245483398438, + "rewards/rejected": -3.993809223175049, + "step": 909 + }, + { + "epoch": 0.5943342313658257, + "grad_norm": 37.07411490442863, + "learning_rate": 6.349562588409858e-08, + "logits/chosen": -1.540130853652954, + "logits/rejected": -1.5190839767456055, + "logps/chosen": -887.24462890625, + "logps/rejected": -943.5260620117188, + "loss": 0.5406, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2311315536499023, + "rewards/margins": 0.5626290440559387, + "rewards/rejected": -3.7937607765197754, + "step": 910 + }, + { + "epoch": 0.5949873459057883, + "grad_norm": 81.1787368218721, + "learning_rate": 6.332657027517865e-08, + "logits/chosen": -1.531748652458191, + "logits/rejected": -1.566206932067871, + "logps/chosen": -858.2910766601562, + "logps/rejected": -1005.5297241210938, + "loss": 0.5071, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.267298698425293, + "rewards/margins": 1.0169984102249146, + "rewards/rejected": -4.284296989440918, + "step": 911 + }, + { + "epoch": 0.5956404604457507, + "grad_norm": 32.28904552141215, + "learning_rate": 6.315757542799137e-08, + "logits/chosen": -1.5486570596694946, + "logits/rejected": -1.530015230178833, + "logps/chosen": -837.4837036132812, + "logps/rejected": -862.83447265625, + "loss": 0.5273, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.196139335632324, + "rewards/margins": 0.7048149108886719, + "rewards/rejected": -3.900954246520996, + "step": 912 + }, + { + "epoch": 0.5962935749857131, + "grad_norm": 10.082427550315396, + "learning_rate": 6.29886422221771e-08, + "logits/chosen": -1.6052241325378418, + "logits/rejected": -1.5839667320251465, + "logps/chosen": -873.5725708007812, + "logps/rejected": -978.7376708984375, + "loss": 0.4844, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.3677515983581543, + "rewards/margins": 1.1688979864120483, + "rewards/rejected": -4.536649227142334, + "step": 913 + }, + { + "epoch": 0.5969466895256755, + "grad_norm": 49.67965132832666, + "learning_rate": 6.281977153705534e-08, + "logits/chosen": -1.5648092031478882, + "logits/rejected": -1.5402863025665283, + "logps/chosen": -851.8053588867188, + "logps/rejected": -900.38818359375, + "loss": 0.5503, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.964672327041626, + "rewards/margins": 0.8616347312927246, + "rewards/rejected": -3.8263068199157715, + "step": 914 + }, + { + "epoch": 0.5975998040656381, + "grad_norm": 19.092868780473093, + "learning_rate": 6.265096425162015e-08, + "logits/chosen": -1.4740824699401855, + "logits/rejected": -1.4873621463775635, + "logps/chosen": -818.6807250976562, + "logps/rejected": -898.1356201171875, + "loss": 0.5058, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7591428756713867, + "rewards/margins": 1.210620641708374, + "rewards/rejected": -3.9697635173797607, + "step": 915 + }, + { + "epoch": 0.5982529186056005, + "grad_norm": 47.5168167772122, + "learning_rate": 6.24822212445356e-08, + "logits/chosen": -1.4681650400161743, + "logits/rejected": -1.4547978639602661, + "logps/chosen": -792.4906616210938, + "logps/rejected": -850.310546875, + "loss": 0.4641, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.865018367767334, + "rewards/margins": 0.6488918662071228, + "rewards/rejected": -3.5139102935791016, + "step": 916 + }, + { + "epoch": 0.5989060331455629, + "grad_norm": 33.12140555034356, + "learning_rate": 6.231354339413116e-08, + "logits/chosen": -1.4651777744293213, + "logits/rejected": -1.473002314567566, + "logps/chosen": -830.7955932617188, + "logps/rejected": -866.82080078125, + "loss": 0.5181, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.1264560222625732, + "rewards/margins": 0.34810954332351685, + "rewards/rejected": -3.4745657444000244, + "step": 917 + }, + { + "epoch": 0.5995591476855253, + "grad_norm": 15.005549258653641, + "learning_rate": 6.214493157839716e-08, + "logits/chosen": -1.5664610862731934, + "logits/rejected": -1.541873574256897, + "logps/chosen": -868.6499633789062, + "logps/rejected": -926.1016845703125, + "loss": 0.444, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0149686336517334, + "rewards/margins": 0.7024274468421936, + "rewards/rejected": -3.7173960208892822, + "step": 918 + }, + { + "epoch": 0.6002122622254878, + "grad_norm": 31.0641454388364, + "learning_rate": 6.197638667498022e-08, + "logits/chosen": -1.5562750101089478, + "logits/rejected": -1.5133049488067627, + "logps/chosen": -773.593994140625, + "logps/rejected": -830.7118530273438, + "loss": 0.5931, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2983880043029785, + "rewards/margins": 0.45370548963546753, + "rewards/rejected": -3.7520933151245117, + "step": 919 + }, + { + "epoch": 0.6008653767654503, + "grad_norm": 20.956839284919962, + "learning_rate": 6.180790956117867e-08, + "logits/chosen": -1.467447280883789, + "logits/rejected": -1.4105029106140137, + "logps/chosen": -782.5927124023438, + "logps/rejected": -818.0179443359375, + "loss": 0.508, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.0541372299194336, + "rewards/margins": 0.4292081892490387, + "rewards/rejected": -3.4833457469940186, + "step": 920 + }, + { + "epoch": 0.6015184913054127, + "grad_norm": 14.668503955298801, + "learning_rate": 6.163950111393799e-08, + "logits/chosen": -1.5520689487457275, + "logits/rejected": -1.5361812114715576, + "logps/chosen": -864.1260375976562, + "logps/rejected": -910.3584594726562, + "loss": 0.5677, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.183807134628296, + "rewards/margins": 1.1038367748260498, + "rewards/rejected": -4.287644386291504, + "step": 921 + }, + { + "epoch": 0.6021716058453751, + "grad_norm": 27.305517777571882, + "learning_rate": 6.147116220984622e-08, + "logits/chosen": -1.4598233699798584, + "logits/rejected": -1.460257649421692, + "logps/chosen": -816.2798461914062, + "logps/rejected": -911.81591796875, + "loss": 0.4793, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3738749027252197, + "rewards/margins": 0.7180493474006653, + "rewards/rejected": -4.09192419052124, + "step": 922 + }, + { + "epoch": 0.6028247203853376, + "grad_norm": 41.739674304138276, + "learning_rate": 6.130289372512946e-08, + "logits/chosen": -1.444016933441162, + "logits/rejected": -1.474906325340271, + "logps/chosen": -754.188232421875, + "logps/rejected": -857.038818359375, + "loss": 0.5205, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8269124031066895, + "rewards/margins": 0.8509232997894287, + "rewards/rejected": -3.677835464477539, + "step": 923 + }, + { + "epoch": 0.6034778349253, + "grad_norm": 58.02110360239537, + "learning_rate": 6.113469653564719e-08, + "logits/chosen": -1.4690029621124268, + "logits/rejected": -1.4412765502929688, + "logps/chosen": -810.4243774414062, + "logps/rejected": -840.3269653320312, + "loss": 0.5251, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8800265789031982, + "rewards/margins": 0.6990075707435608, + "rewards/rejected": -3.5790340900421143, + "step": 924 + }, + { + "epoch": 0.6041309494652625, + "grad_norm": 61.758663895068544, + "learning_rate": 6.096657151688788e-08, + "logits/chosen": -1.4827535152435303, + "logits/rejected": -1.4380667209625244, + "logps/chosen": -844.2979736328125, + "logps/rejected": -990.4068603515625, + "loss": 0.5044, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.213895082473755, + "rewards/margins": 0.7906748056411743, + "rewards/rejected": -4.004570007324219, + "step": 925 + }, + { + "epoch": 0.6047840640052249, + "grad_norm": 47.91069541881108, + "learning_rate": 6.07985195439643e-08, + "logits/chosen": -1.5746287107467651, + "logits/rejected": -1.5688815116882324, + "logps/chosen": -938.1190185546875, + "logps/rejected": -984.5784912109375, + "loss": 0.4972, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6475698947906494, + "rewards/margins": 0.6271919012069702, + "rewards/rejected": -4.274761199951172, + "step": 926 + }, + { + "epoch": 0.6054371785451874, + "grad_norm": 12.621896694201023, + "learning_rate": 6.063054149160899e-08, + "logits/chosen": -1.4833399057388306, + "logits/rejected": -1.5237452983856201, + "logps/chosen": -763.331298828125, + "logps/rejected": -738.412353515625, + "loss": 0.6048, + "rewards/accuracies": 0.40625, + "rewards/chosen": -2.918760299682617, + "rewards/margins": 0.033799026161432266, + "rewards/rejected": -2.952559232711792, + "step": 927 + }, + { + "epoch": 0.6060902930851498, + "grad_norm": 48.565935333486564, + "learning_rate": 6.046263823416975e-08, + "logits/chosen": -1.5271244049072266, + "logits/rejected": -1.4947071075439453, + "logps/chosen": -845.4649047851562, + "logps/rejected": -829.05615234375, + "loss": 0.5946, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.154777765274048, + "rewards/margins": 0.36189785599708557, + "rewards/rejected": -3.5166759490966797, + "step": 928 + }, + { + "epoch": 0.6067434076251123, + "grad_norm": 201.1566716246379, + "learning_rate": 6.029481064560507e-08, + "logits/chosen": -1.5659098625183105, + "logits/rejected": -1.4598219394683838, + "logps/chosen": -745.5149536132812, + "logps/rejected": -827.2947998046875, + "loss": 0.5307, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.176211357116699, + "rewards/margins": 0.9403245449066162, + "rewards/rejected": -4.116535663604736, + "step": 929 + }, + { + "epoch": 0.6073965221650747, + "grad_norm": 77.8419741018575, + "learning_rate": 6.012705959947953e-08, + "logits/chosen": -1.5268572568893433, + "logits/rejected": -1.5410304069519043, + "logps/chosen": -791.3049926757812, + "logps/rejected": -944.24365234375, + "loss": 0.5016, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.995246410369873, + "rewards/margins": 0.96639084815979, + "rewards/rejected": -3.961637258529663, + "step": 930 + }, + { + "epoch": 0.6080496367050372, + "grad_norm": 21.356823796166157, + "learning_rate": 5.995938596895936e-08, + "logits/chosen": -1.5197898149490356, + "logits/rejected": -1.5448129177093506, + "logps/chosen": -880.7378540039062, + "logps/rejected": -929.0924682617188, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1356070041656494, + "rewards/margins": 0.8477343320846558, + "rewards/rejected": -3.9833414554595947, + "step": 931 + }, + { + "epoch": 0.6087027512449996, + "grad_norm": 78.1296697326321, + "learning_rate": 5.979179062680777e-08, + "logits/chosen": -1.5615133047103882, + "logits/rejected": -1.5305187702178955, + "logps/chosen": -805.41748046875, + "logps/rejected": -857.9087524414062, + "loss": 0.4887, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.8929922580718994, + "rewards/margins": 0.7500249743461609, + "rewards/rejected": -3.643017292022705, + "step": 932 + }, + { + "epoch": 0.609355865784962, + "grad_norm": 44.16786903754792, + "learning_rate": 5.96242744453805e-08, + "logits/chosen": -1.5862808227539062, + "logits/rejected": -1.500042200088501, + "logps/chosen": -957.4008178710938, + "logps/rejected": -1003.3927001953125, + "loss": 0.5375, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4804091453552246, + "rewards/margins": 0.9446480870246887, + "rewards/rejected": -4.425057411193848, + "step": 933 + }, + { + "epoch": 0.6100089803249245, + "grad_norm": 43.88738627099548, + "learning_rate": 5.945683829662129e-08, + "logits/chosen": -1.4918217658996582, + "logits/rejected": -1.4933407306671143, + "logps/chosen": -826.7015380859375, + "logps/rejected": -888.3599853515625, + "loss": 0.5092, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8977818489074707, + "rewards/margins": 0.904629111289978, + "rewards/rejected": -3.80241060256958, + "step": 934 + }, + { + "epoch": 0.610662094864887, + "grad_norm": 84.54075685318386, + "learning_rate": 5.928948305205719e-08, + "logits/chosen": -1.6410014629364014, + "logits/rejected": -1.5713551044464111, + "logps/chosen": -862.2045288085938, + "logps/rejected": -862.973876953125, + "loss": 0.4944, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.225311517715454, + "rewards/margins": 0.5583637356758118, + "rewards/rejected": -3.7836754322052, + "step": 935 + }, + { + "epoch": 0.6113152094048494, + "grad_norm": 49.202199653752565, + "learning_rate": 5.912220958279421e-08, + "logits/chosen": -1.525040626525879, + "logits/rejected": -1.5112783908843994, + "logps/chosen": -898.3567504882812, + "logps/rejected": -948.94091796875, + "loss": 0.4935, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4913763999938965, + "rewards/margins": 0.7729541063308716, + "rewards/rejected": -4.2643303871154785, + "step": 936 + }, + { + "epoch": 0.6119683239448118, + "grad_norm": 19.051733763857207, + "learning_rate": 5.895501875951271e-08, + "logits/chosen": -1.5405813455581665, + "logits/rejected": -1.4980101585388184, + "logps/chosen": -818.7405395507812, + "logps/rejected": -821.7682495117188, + "loss": 0.5139, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.834625482559204, + "rewards/margins": 0.37921154499053955, + "rewards/rejected": -3.213836669921875, + "step": 937 + }, + { + "epoch": 0.6126214384847742, + "grad_norm": 39.98147203055881, + "learning_rate": 5.878791145246284e-08, + "logits/chosen": -1.552310585975647, + "logits/rejected": -1.4916616678237915, + "logps/chosen": -828.3511962890625, + "logps/rejected": -897.640380859375, + "loss": 0.4085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9902727603912354, + "rewards/margins": 0.938800036907196, + "rewards/rejected": -3.929072856903076, + "step": 938 + }, + { + "epoch": 0.6132745530247368, + "grad_norm": 9.665640962792187, + "learning_rate": 5.862088853146006e-08, + "logits/chosen": -1.483540654182434, + "logits/rejected": -1.4673054218292236, + "logps/chosen": -868.4718627929688, + "logps/rejected": -904.5808715820312, + "loss": 0.4925, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.237854480743408, + "rewards/margins": 0.40152764320373535, + "rewards/rejected": -3.6393821239471436, + "step": 939 + }, + { + "epoch": 0.6139276675646992, + "grad_norm": 29.6041722046408, + "learning_rate": 5.8453950865880574e-08, + "logits/chosen": -1.5257904529571533, + "logits/rejected": -1.5109593868255615, + "logps/chosen": -839.3677368164062, + "logps/rejected": -890.6753540039062, + "loss": 0.4616, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.944354295730591, + "rewards/margins": 0.7917759418487549, + "rewards/rejected": -3.7361302375793457, + "step": 940 + }, + { + "epoch": 0.6145807821046616, + "grad_norm": 36.1361730170012, + "learning_rate": 5.82870993246568e-08, + "logits/chosen": -1.575820803642273, + "logits/rejected": -1.5428611040115356, + "logps/chosen": -984.8616943359375, + "logps/rejected": -1002.877685546875, + "loss": 0.5118, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8942298889160156, + "rewards/margins": 0.8039427995681763, + "rewards/rejected": -4.6981730461120605, + "step": 941 + }, + { + "epoch": 0.615233896644624, + "grad_norm": 65.95471806082536, + "learning_rate": 5.812033477627295e-08, + "logits/chosen": -1.5245524644851685, + "logits/rejected": -1.4878289699554443, + "logps/chosen": -797.1603393554688, + "logps/rejected": -862.60400390625, + "loss": 0.5424, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.427908420562744, + "rewards/margins": 0.5726900100708008, + "rewards/rejected": -4.000597953796387, + "step": 942 + }, + { + "epoch": 0.6158870111845864, + "grad_norm": 42.35986198202249, + "learning_rate": 5.795365808876033e-08, + "logits/chosen": -1.4567804336547852, + "logits/rejected": -1.4847595691680908, + "logps/chosen": -761.0791625976562, + "logps/rejected": -862.4808349609375, + "loss": 0.4818, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.592480421066284, + "rewards/margins": 0.8631225228309631, + "rewards/rejected": -3.4556026458740234, + "step": 943 + }, + { + "epoch": 0.616540125724549, + "grad_norm": 116.76247999305836, + "learning_rate": 5.778707012969296e-08, + "logits/chosen": -1.5003118515014648, + "logits/rejected": -1.4539164304733276, + "logps/chosen": -782.5404052734375, + "logps/rejected": -837.072509765625, + "loss": 0.5626, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.003209352493286, + "rewards/margins": 0.6795889139175415, + "rewards/rejected": -3.682798385620117, + "step": 944 + }, + { + "epoch": 0.6171932402645114, + "grad_norm": 36.38150538725848, + "learning_rate": 5.762057176618306e-08, + "logits/chosen": -1.5234191417694092, + "logits/rejected": -1.5212476253509521, + "logps/chosen": -823.0789184570312, + "logps/rejected": -1108.3194580078125, + "loss": 0.4678, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8806896209716797, + "rewards/margins": 1.4171735048294067, + "rewards/rejected": -4.297863483428955, + "step": 945 + }, + { + "epoch": 0.6178463548044738, + "grad_norm": 24.621342657442526, + "learning_rate": 5.745416386487637e-08, + "logits/chosen": -1.4999808073043823, + "logits/rejected": -1.5050296783447266, + "logps/chosen": -884.189697265625, + "logps/rejected": -1065.35546875, + "loss": 0.4709, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3289954662323, + "rewards/margins": 1.0373826026916504, + "rewards/rejected": -4.366378307342529, + "step": 946 + }, + { + "epoch": 0.6184994693444362, + "grad_norm": 39.9664210404075, + "learning_rate": 5.728784729194788e-08, + "logits/chosen": -1.6225008964538574, + "logits/rejected": -1.5957049131393433, + "logps/chosen": -797.822021484375, + "logps/rejected": -842.7942504882812, + "loss": 0.459, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.0013129711151123, + "rewards/margins": 0.8668568134307861, + "rewards/rejected": -3.8681697845458984, + "step": 947 + }, + { + "epoch": 0.6191525838843988, + "grad_norm": 18.300722710956293, + "learning_rate": 5.712162291309717e-08, + "logits/chosen": -1.4809714555740356, + "logits/rejected": -1.4627764225006104, + "logps/chosen": -780.6005859375, + "logps/rejected": -837.5048828125, + "loss": 0.4601, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.6685848236083984, + "rewards/margins": 0.947687029838562, + "rewards/rejected": -3.61627197265625, + "step": 948 + }, + { + "epoch": 0.6198056984243612, + "grad_norm": 48.042619463612205, + "learning_rate": 5.695549159354392e-08, + "logits/chosen": -1.5010271072387695, + "logits/rejected": -1.5268317461013794, + "logps/chosen": -862.3072509765625, + "logps/rejected": -902.5770263671875, + "loss": 0.5344, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.204814910888672, + "rewards/margins": 0.5735858082771301, + "rewards/rejected": -3.7784006595611572, + "step": 949 + }, + { + "epoch": 0.6204588129643236, + "grad_norm": 12.02401612140606, + "learning_rate": 5.678945419802344e-08, + "logits/chosen": -1.5606863498687744, + "logits/rejected": -1.553232192993164, + "logps/chosen": -786.1539916992188, + "logps/rejected": -862.3563842773438, + "loss": 0.4561, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9571497440338135, + "rewards/margins": 0.9905616641044617, + "rewards/rejected": -3.94771146774292, + "step": 950 + }, + { + "epoch": 0.621111927504286, + "grad_norm": 23.86580730911811, + "learning_rate": 5.662351159078216e-08, + "logits/chosen": -1.552533745765686, + "logits/rejected": -1.5205721855163574, + "logps/chosen": -958.0302124023438, + "logps/rejected": -975.6397094726562, + "loss": 0.5031, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.6955161094665527, + "rewards/margins": 0.5410246253013611, + "rewards/rejected": -4.2365403175354, + "step": 951 + }, + { + "epoch": 0.6217650420442485, + "grad_norm": 135.43034670723952, + "learning_rate": 5.645766463557309e-08, + "logits/chosen": -1.4750163555145264, + "logits/rejected": -1.4967775344848633, + "logps/chosen": -814.1315307617188, + "logps/rejected": -898.7764282226562, + "loss": 0.554, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.208841562271118, + "rewards/margins": 0.698648989200592, + "rewards/rejected": -3.9074904918670654, + "step": 952 + }, + { + "epoch": 0.622418156584211, + "grad_norm": 60.25491022191491, + "learning_rate": 5.629191419565141e-08, + "logits/chosen": -1.471923828125, + "logits/rejected": -1.5026978254318237, + "logps/chosen": -836.269775390625, + "logps/rejected": -875.132080078125, + "loss": 0.4922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.013134479522705, + "rewards/margins": 0.8672950267791748, + "rewards/rejected": -3.880429744720459, + "step": 953 + }, + { + "epoch": 0.6230712711241734, + "grad_norm": 70.1250850771653, + "learning_rate": 5.612626113376988e-08, + "logits/chosen": -1.415960431098938, + "logits/rejected": -1.4024447202682495, + "logps/chosen": -861.2704467773438, + "logps/rejected": -906.9395751953125, + "loss": 0.4583, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9992175102233887, + "rewards/margins": 1.1700170040130615, + "rewards/rejected": -4.169234752655029, + "step": 954 + }, + { + "epoch": 0.6237243856641358, + "grad_norm": 118.63943863651916, + "learning_rate": 5.596070631217441e-08, + "logits/chosen": -1.5227341651916504, + "logits/rejected": -1.5267282724380493, + "logps/chosen": -795.377197265625, + "logps/rejected": -826.0686645507812, + "loss": 0.5701, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9778614044189453, + "rewards/margins": 0.6003108024597168, + "rewards/rejected": -3.578172206878662, + "step": 955 + }, + { + "epoch": 0.6243775002040983, + "grad_norm": 17.56854621129216, + "learning_rate": 5.579525059259957e-08, + "logits/chosen": -1.5488358736038208, + "logits/rejected": -1.573460340499878, + "logps/chosen": -817.08447265625, + "logps/rejected": -863.1941528320312, + "loss": 0.4524, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.050839900970459, + "rewards/margins": 0.9858266115188599, + "rewards/rejected": -4.0366668701171875, + "step": 956 + }, + { + "epoch": 0.6250306147440607, + "grad_norm": 69.88285733489641, + "learning_rate": 5.562989483626409e-08, + "logits/chosen": -1.4820830821990967, + "logits/rejected": -1.4609520435333252, + "logps/chosen": -727.6483154296875, + "logps/rejected": -839.3212280273438, + "loss": 0.4287, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8100788593292236, + "rewards/margins": 1.259989857673645, + "rewards/rejected": -4.070068836212158, + "step": 957 + }, + { + "epoch": 0.6256837292840232, + "grad_norm": 12.018768296537614, + "learning_rate": 5.546463990386634e-08, + "logits/chosen": -1.5005813837051392, + "logits/rejected": -1.522735357284546, + "logps/chosen": -787.5521850585938, + "logps/rejected": -897.03564453125, + "loss": 0.4824, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8902053833007812, + "rewards/margins": 1.399174451828003, + "rewards/rejected": -4.289380073547363, + "step": 958 + }, + { + "epoch": 0.6263368438239856, + "grad_norm": 33.17348323808573, + "learning_rate": 5.5299486655579924e-08, + "logits/chosen": -1.530156135559082, + "logits/rejected": -1.5278276205062866, + "logps/chosen": -807.750244140625, + "logps/rejected": -950.3114013671875, + "loss": 0.4757, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.519279956817627, + "rewards/margins": 1.149764060974121, + "rewards/rejected": -4.669044494628906, + "step": 959 + }, + { + "epoch": 0.6269899583639481, + "grad_norm": 15.26418942385675, + "learning_rate": 5.513443595104917e-08, + "logits/chosen": -1.4855040311813354, + "logits/rejected": -1.495133638381958, + "logps/chosen": -800.33837890625, + "logps/rejected": -888.4577026367188, + "loss": 0.4282, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.716895580291748, + "rewards/margins": 0.8394394516944885, + "rewards/rejected": -3.556334972381592, + "step": 960 + }, + { + "epoch": 0.6276430729039105, + "grad_norm": 18.55918206412983, + "learning_rate": 5.496948864938463e-08, + "logits/chosen": -1.564420461654663, + "logits/rejected": -1.5435676574707031, + "logps/chosen": -925.506591796875, + "logps/rejected": -1076.7103271484375, + "loss": 0.4707, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.589116096496582, + "rewards/margins": 1.4336352348327637, + "rewards/rejected": -5.022751331329346, + "step": 961 + }, + { + "epoch": 0.6282961874438729, + "grad_norm": 36.21241704409419, + "learning_rate": 5.480464560915865e-08, + "logits/chosen": -1.457442045211792, + "logits/rejected": -1.466660976409912, + "logps/chosen": -880.1464233398438, + "logps/rejected": -952.699462890625, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.218412160873413, + "rewards/margins": 0.8761493563652039, + "rewards/rejected": -4.094561576843262, + "step": 962 + }, + { + "epoch": 0.6289493019838354, + "grad_norm": 59.3801136308343, + "learning_rate": 5.463990768840088e-08, + "logits/chosen": -1.5099765062332153, + "logits/rejected": -1.4892768859863281, + "logps/chosen": -755.8883056640625, + "logps/rejected": -835.78173828125, + "loss": 0.5088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6380348205566406, + "rewards/margins": 0.8317041993141174, + "rewards/rejected": -3.469738721847534, + "step": 963 + }, + { + "epoch": 0.6296024165237979, + "grad_norm": 47.301890427275495, + "learning_rate": 5.447527574459378e-08, + "logits/chosen": -1.522971272468567, + "logits/rejected": -1.533935546875, + "logps/chosen": -732.6190185546875, + "logps/rejected": -835.910888671875, + "loss": 0.4433, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6405673027038574, + "rewards/margins": 0.8309347033500671, + "rewards/rejected": -3.4715018272399902, + "step": 964 + }, + { + "epoch": 0.6302555310637603, + "grad_norm": 36.636882714332266, + "learning_rate": 5.431075063466824e-08, + "logits/chosen": -1.485040307044983, + "logits/rejected": -1.5213403701782227, + "logps/chosen": -858.669921875, + "logps/rejected": -968.9814453125, + "loss": 0.4807, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.340672254562378, + "rewards/margins": 0.8855950236320496, + "rewards/rejected": -4.226266860961914, + "step": 965 + }, + { + "epoch": 0.6309086456037227, + "grad_norm": 126.49507570203406, + "learning_rate": 5.4146333214998996e-08, + "logits/chosen": -1.5613657236099243, + "logits/rejected": -1.510394811630249, + "logps/chosen": -831.458740234375, + "logps/rejected": -846.1495971679688, + "loss": 0.5621, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.379094362258911, + "rewards/margins": 0.5405767560005188, + "rewards/rejected": -3.919670820236206, + "step": 966 + }, + { + "epoch": 0.6315617601436851, + "grad_norm": 34.41276919074804, + "learning_rate": 5.39820243414003e-08, + "logits/chosen": -1.5569851398468018, + "logits/rejected": -1.5476921796798706, + "logps/chosen": -832.99169921875, + "logps/rejected": -912.7144165039062, + "loss": 0.4608, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6668541431427, + "rewards/margins": 0.7351149916648865, + "rewards/rejected": -4.401968955993652, + "step": 967 + }, + { + "epoch": 0.6322148746836477, + "grad_norm": 16.388453341999888, + "learning_rate": 5.381782486912144e-08, + "logits/chosen": -1.5224254131317139, + "logits/rejected": -1.48786199092865, + "logps/chosen": -816.0333862304688, + "logps/rejected": -842.23486328125, + "loss": 0.5767, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6304800510406494, + "rewards/margins": 0.6139175891876221, + "rewards/rejected": -4.24439811706543, + "step": 968 + }, + { + "epoch": 0.6328679892236101, + "grad_norm": 16.605006142190543, + "learning_rate": 5.365373565284211e-08, + "logits/chosen": -1.5403863191604614, + "logits/rejected": -1.54334557056427, + "logps/chosen": -795.1408081054688, + "logps/rejected": -888.0582275390625, + "loss": 0.5094, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9061810970306396, + "rewards/margins": 0.6548399925231934, + "rewards/rejected": -3.561021089553833, + "step": 969 + }, + { + "epoch": 0.6335211037635725, + "grad_norm": 86.2008006757627, + "learning_rate": 5.348975754666825e-08, + "logits/chosen": -1.4919219017028809, + "logits/rejected": -1.508528709411621, + "logps/chosen": -781.9150390625, + "logps/rejected": -914.5775146484375, + "loss": 0.5111, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3672971725463867, + "rewards/margins": 0.6925301551818848, + "rewards/rejected": -4.0598273277282715, + "step": 970 + }, + { + "epoch": 0.6341742183035349, + "grad_norm": 108.14966269733243, + "learning_rate": 5.33258914041274e-08, + "logits/chosen": -1.6157301664352417, + "logits/rejected": -1.5612612962722778, + "logps/chosen": -840.8732299804688, + "logps/rejected": -893.8431396484375, + "loss": 0.4703, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.353294610977173, + "rewards/margins": 0.7179979681968689, + "rewards/rejected": -4.071292877197266, + "step": 971 + }, + { + "epoch": 0.6348273328434975, + "grad_norm": 13.7781550358131, + "learning_rate": 5.316213807816432e-08, + "logits/chosen": -1.5367947816848755, + "logits/rejected": -1.4901845455169678, + "logps/chosen": -795.0319213867188, + "logps/rejected": -865.484375, + "loss": 0.4664, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.197481155395508, + "rewards/margins": 0.9921532869338989, + "rewards/rejected": -4.189634323120117, + "step": 972 + }, + { + "epoch": 0.6354804473834599, + "grad_norm": 22.603804513420233, + "learning_rate": 5.299849842113656e-08, + "logits/chosen": -1.5428191423416138, + "logits/rejected": -1.565401554107666, + "logps/chosen": -923.1882934570312, + "logps/rejected": -956.2698974609375, + "loss": 0.4838, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5026655197143555, + "rewards/margins": 0.8109586238861084, + "rewards/rejected": -4.313623905181885, + "step": 973 + }, + { + "epoch": 0.6361335619234223, + "grad_norm": 126.16238690733957, + "learning_rate": 5.283497328480998e-08, + "logits/chosen": -1.6220266819000244, + "logits/rejected": -1.613905906677246, + "logps/chosen": -949.4186401367188, + "logps/rejected": -993.9503784179688, + "loss": 0.5042, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.326920986175537, + "rewards/margins": 0.878463864326477, + "rewards/rejected": -4.205385208129883, + "step": 974 + }, + { + "epoch": 0.6367866764633847, + "grad_norm": 65.27147183382468, + "learning_rate": 5.267156352035437e-08, + "logits/chosen": -1.5607855319976807, + "logits/rejected": -1.5158913135528564, + "logps/chosen": -880.0157470703125, + "logps/rejected": -966.0748291015625, + "loss": 0.4936, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.369838237762451, + "rewards/margins": 0.9538223147392273, + "rewards/rejected": -4.323660850524902, + "step": 975 + }, + { + "epoch": 0.6374397910033472, + "grad_norm": 22.36577561540512, + "learning_rate": 5.250826997833899e-08, + "logits/chosen": -1.5453389883041382, + "logits/rejected": -1.540825366973877, + "logps/chosen": -879.3869018554688, + "logps/rejected": -885.604248046875, + "loss": 0.5292, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.171088218688965, + "rewards/margins": 0.3536863625049591, + "rewards/rejected": -3.5247745513916016, + "step": 976 + }, + { + "epoch": 0.6380929055433097, + "grad_norm": 10.578277029115368, + "learning_rate": 5.234509350872813e-08, + "logits/chosen": -1.5381667613983154, + "logits/rejected": -1.5094205141067505, + "logps/chosen": -812.7030639648438, + "logps/rejected": -927.2329711914062, + "loss": 0.5076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.316643476486206, + "rewards/margins": 1.0832691192626953, + "rewards/rejected": -4.399912357330322, + "step": 977 + }, + { + "epoch": 0.6387460200832721, + "grad_norm": 117.19097325210925, + "learning_rate": 5.218203496087671e-08, + "logits/chosen": -1.555480718612671, + "logits/rejected": -1.554026484489441, + "logps/chosen": -885.7477416992188, + "logps/rejected": -906.7445068359375, + "loss": 0.6101, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1535983085632324, + "rewards/margins": 0.5752413272857666, + "rewards/rejected": -3.72883939743042, + "step": 978 + }, + { + "epoch": 0.6393991346232345, + "grad_norm": 43.06647390428903, + "learning_rate": 5.2019095183525886e-08, + "logits/chosen": -1.5017975568771362, + "logits/rejected": -1.5113730430603027, + "logps/chosen": -754.3181762695312, + "logps/rejected": -775.8157958984375, + "loss": 0.5283, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1152279376983643, + "rewards/margins": 0.6389954090118408, + "rewards/rejected": -3.754223346710205, + "step": 979 + }, + { + "epoch": 0.640052249163197, + "grad_norm": 41.64410725373992, + "learning_rate": 5.185627502479857e-08, + "logits/chosen": -1.5162409543991089, + "logits/rejected": -1.5376276969909668, + "logps/chosen": -834.6458740234375, + "logps/rejected": -990.2511596679688, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.955130100250244, + "rewards/margins": 1.1836378574371338, + "rewards/rejected": -4.138767719268799, + "step": 980 + }, + { + "epoch": 0.6407053637031594, + "grad_norm": 58.564011363723154, + "learning_rate": 5.1693575332195006e-08, + "logits/chosen": -1.4699060916900635, + "logits/rejected": -1.4639524221420288, + "logps/chosen": -755.951416015625, + "logps/rejected": -867.30126953125, + "loss": 0.5015, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9639952182769775, + "rewards/margins": 1.0040624141693115, + "rewards/rejected": -3.968057632446289, + "step": 981 + }, + { + "epoch": 0.6413584782431219, + "grad_norm": 87.46285821272616, + "learning_rate": 5.153099695258843e-08, + "logits/chosen": -1.4716684818267822, + "logits/rejected": -1.5059077739715576, + "logps/chosen": -923.4231567382812, + "logps/rejected": -1027.0228271484375, + "loss": 0.4797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.596079111099243, + "rewards/margins": 1.1137962341308594, + "rewards/rejected": -4.709875583648682, + "step": 982 + }, + { + "epoch": 0.6420115927830843, + "grad_norm": 34.65500498760471, + "learning_rate": 5.1368540732220656e-08, + "logits/chosen": -1.5052711963653564, + "logits/rejected": -1.525931477546692, + "logps/chosen": -801.951171875, + "logps/rejected": -935.139404296875, + "loss": 0.5685, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9107887744903564, + "rewards/margins": 1.193021535873413, + "rewards/rejected": -4.1038103103637695, + "step": 983 + }, + { + "epoch": 0.6426647073230468, + "grad_norm": 15.45197836175366, + "learning_rate": 5.1206207516697614e-08, + "logits/chosen": -1.5689780712127686, + "logits/rejected": -1.4814107418060303, + "logps/chosen": -876.8812255859375, + "logps/rejected": -1007.7050170898438, + "loss": 0.5014, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.607487678527832, + "rewards/margins": 1.0582144260406494, + "rewards/rejected": -4.665701866149902, + "step": 984 + }, + { + "epoch": 0.6433178218630092, + "grad_norm": 29.42430407328828, + "learning_rate": 5.104399815098496e-08, + "logits/chosen": -1.5423859357833862, + "logits/rejected": -1.5497791767120361, + "logps/chosen": -816.120849609375, + "logps/rejected": -943.1589965820312, + "loss": 0.5109, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1182186603546143, + "rewards/margins": 0.9232460856437683, + "rewards/rejected": -4.041464805603027, + "step": 985 + }, + { + "epoch": 0.6439709364029716, + "grad_norm": 18.815807983433164, + "learning_rate": 5.088191347940375e-08, + "logits/chosen": -1.51715087890625, + "logits/rejected": -1.5343945026397705, + "logps/chosen": -813.7894287109375, + "logps/rejected": -986.6640014648438, + "loss": 0.413, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2437820434570312, + "rewards/margins": 1.1290417909622192, + "rewards/rejected": -4.372823238372803, + "step": 986 + }, + { + "epoch": 0.6446240509429341, + "grad_norm": 25.895229217854762, + "learning_rate": 5.071995434562592e-08, + "logits/chosen": -1.4702000617980957, + "logits/rejected": -1.4505935907363892, + "logps/chosen": -854.226318359375, + "logps/rejected": -964.643310546875, + "loss": 0.4751, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3152987957000732, + "rewards/margins": 0.7256244421005249, + "rewards/rejected": -4.040923595428467, + "step": 987 + }, + { + "epoch": 0.6452771654828966, + "grad_norm": 33.724194436827155, + "learning_rate": 5.055812159267003e-08, + "logits/chosen": -1.5073872804641724, + "logits/rejected": -1.4952834844589233, + "logps/chosen": -855.9230346679688, + "logps/rejected": -1033.7293701171875, + "loss": 0.4451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2582755088806152, + "rewards/margins": 1.6962679624557495, + "rewards/rejected": -4.9545440673828125, + "step": 988 + }, + { + "epoch": 0.645930280022859, + "grad_norm": 11.66573176765737, + "learning_rate": 5.0396416062896766e-08, + "logits/chosen": -1.579730749130249, + "logits/rejected": -1.5733931064605713, + "logps/chosen": -906.1632080078125, + "logps/rejected": -925.17626953125, + "loss": 0.5891, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.705543041229248, + "rewards/margins": 0.6829342246055603, + "rewards/rejected": -4.388477325439453, + "step": 989 + }, + { + "epoch": 0.6465833945628214, + "grad_norm": 150.79764751923673, + "learning_rate": 5.023483859800463e-08, + "logits/chosen": -1.5521610975265503, + "logits/rejected": -1.5487536191940308, + "logps/chosen": -818.4269409179688, + "logps/rejected": -842.7174682617188, + "loss": 0.5022, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1259286403656006, + "rewards/margins": 0.5289927124977112, + "rewards/rejected": -3.654921293258667, + "step": 990 + }, + { + "epoch": 0.6472365091027839, + "grad_norm": 16.00702278017296, + "learning_rate": 5.0073390039025534e-08, + "logits/chosen": -1.443716049194336, + "logits/rejected": -1.415492057800293, + "logps/chosen": -841.180419921875, + "logps/rejected": -963.9603271484375, + "loss": 0.5006, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2827086448669434, + "rewards/margins": 0.9521893262863159, + "rewards/rejected": -4.234897613525391, + "step": 991 + }, + { + "epoch": 0.6478896236427464, + "grad_norm": 65.24190893051149, + "learning_rate": 4.991207122632035e-08, + "logits/chosen": -1.5591700077056885, + "logits/rejected": -1.5191017389297485, + "logps/chosen": -942.4129638671875, + "logps/rejected": -1009.4859619140625, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8302550315856934, + "rewards/margins": 0.7394087314605713, + "rewards/rejected": -4.5696635246276855, + "step": 992 + }, + { + "epoch": 0.6485427381827088, + "grad_norm": 41.277206672220316, + "learning_rate": 4.975088299957471e-08, + "logits/chosen": -1.557931661605835, + "logits/rejected": -1.555905818939209, + "logps/chosen": -836.3804931640625, + "logps/rejected": -922.718994140625, + "loss": 0.4929, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.1140992641448975, + "rewards/margins": 1.1032991409301758, + "rewards/rejected": -4.217398643493652, + "step": 993 + }, + { + "epoch": 0.6491958527226712, + "grad_norm": 14.725778545114895, + "learning_rate": 4.958982619779442e-08, + "logits/chosen": -1.5064477920532227, + "logits/rejected": -1.521809697151184, + "logps/chosen": -736.304931640625, + "logps/rejected": -813.41650390625, + "loss": 0.4734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9339964389801025, + "rewards/margins": 0.7689220309257507, + "rewards/rejected": -3.702918291091919, + "step": 994 + }, + { + "epoch": 0.6498489672626336, + "grad_norm": 52.91945215629695, + "learning_rate": 4.942890165930129e-08, + "logits/chosen": -1.5186612606048584, + "logits/rejected": -1.5100464820861816, + "logps/chosen": -754.244140625, + "logps/rejected": -958.1654052734375, + "loss": 0.4943, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.929853916168213, + "rewards/margins": 1.4619183540344238, + "rewards/rejected": -4.391772270202637, + "step": 995 + }, + { + "epoch": 0.6505020818025962, + "grad_norm": 61.7741247551618, + "learning_rate": 4.926811022172866e-08, + "logits/chosen": -1.5684045553207397, + "logits/rejected": -1.464762568473816, + "logps/chosen": -862.2124633789062, + "logps/rejected": -940.2279052734375, + "loss": 0.4754, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.588654041290283, + "rewards/margins": 1.0604631900787354, + "rewards/rejected": -4.649117469787598, + "step": 996 + }, + { + "epoch": 0.6511551963425586, + "grad_norm": 85.06663516145058, + "learning_rate": 4.9107452722017015e-08, + "logits/chosen": -1.4936169385910034, + "logits/rejected": -1.497806429862976, + "logps/chosen": -827.88720703125, + "logps/rejected": -843.3629150390625, + "loss": 0.5419, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.254192352294922, + "rewards/margins": 0.3634593188762665, + "rewards/rejected": -3.6176517009735107, + "step": 997 + }, + { + "epoch": 0.651808310882521, + "grad_norm": 26.951955302254902, + "learning_rate": 4.894692999640973e-08, + "logits/chosen": -1.5051591396331787, + "logits/rejected": -1.4692599773406982, + "logps/chosen": -797.8718872070312, + "logps/rejected": -915.5535888671875, + "loss": 0.5042, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2658886909484863, + "rewards/margins": 0.9658613801002502, + "rewards/rejected": -4.231750011444092, + "step": 998 + }, + { + "epoch": 0.6524614254224834, + "grad_norm": 55.7608063216077, + "learning_rate": 4.8786542880448653e-08, + "logits/chosen": -1.537933349609375, + "logits/rejected": -1.4756388664245605, + "logps/chosen": -817.772705078125, + "logps/rejected": -852.2359008789062, + "loss": 0.5176, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.0438647270202637, + "rewards/margins": 0.5471150875091553, + "rewards/rejected": -3.590980052947998, + "step": 999 + }, + { + "epoch": 0.653114539962446, + "grad_norm": 25.848553185603418, + "learning_rate": 4.8626292208969733e-08, + "logits/chosen": -1.553949236869812, + "logits/rejected": -1.5505160093307495, + "logps/chosen": -906.37109375, + "logps/rejected": -967.929931640625, + "loss": 0.485, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5798516273498535, + "rewards/margins": 0.8147382140159607, + "rewards/rejected": -4.394589900970459, + "step": 1000 + }, + { + "epoch": 0.653114539962446, + "eval_logits/chosen": -1.4997057914733887, + "eval_logits/rejected": -1.483424186706543, + "eval_logps/chosen": -823.3939819335938, + "eval_logps/rejected": -900.8679809570312, + "eval_loss": 0.5032714605331421, + "eval_rewards/accuracies": 0.7630000114440918, + "eval_rewards/chosen": -3.1304855346679688, + "eval_rewards/margins": 0.8558005094528198, + "eval_rewards/rejected": -3.986285924911499, + "eval_runtime": 296.6594, + "eval_samples_per_second": 13.483, + "eval_steps_per_second": 0.843, + "step": 1000 + }, + { + "epoch": 0.6537676545024084, + "grad_norm": 30.491811094126753, + "learning_rate": 4.846617881609876e-08, + "logits/chosen": -1.520873785018921, + "logits/rejected": -1.4895522594451904, + "logps/chosen": -772.0455322265625, + "logps/rejected": -870.1766357421875, + "loss": 0.4863, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9721617698669434, + "rewards/margins": 0.8731052875518799, + "rewards/rejected": -3.845266819000244, + "step": 1001 + }, + { + "epoch": 0.6544207690423708, + "grad_norm": 40.22401127775807, + "learning_rate": 4.8306203535246946e-08, + "logits/chosen": -1.5727264881134033, + "logits/rejected": -1.464941143989563, + "logps/chosen": -808.9293212890625, + "logps/rejected": -871.8182373046875, + "loss": 0.5355, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1198384761810303, + "rewards/margins": 0.9884458780288696, + "rewards/rejected": -4.108283996582031, + "step": 1002 + }, + { + "epoch": 0.6550738835823332, + "grad_norm": 11.806744900302544, + "learning_rate": 4.814636719910657e-08, + "logits/chosen": -1.5598303079605103, + "logits/rejected": -1.474313735961914, + "logps/chosen": -862.10400390625, + "logps/rejected": -857.1552124023438, + "loss": 0.546, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.641342878341675, + "rewards/margins": 0.5948621034622192, + "rewards/rejected": -4.236205101013184, + "step": 1003 + }, + { + "epoch": 0.6557269981222957, + "grad_norm": 15.051255942076827, + "learning_rate": 4.798667063964673e-08, + "logits/chosen": -1.5384275913238525, + "logits/rejected": -1.494019865989685, + "logps/chosen": -754.1705932617188, + "logps/rejected": -797.1336059570312, + "loss": 0.472, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.307189702987671, + "rewards/margins": 0.5865740180015564, + "rewards/rejected": -3.893763542175293, + "step": 1004 + }, + { + "epoch": 0.6563801126622582, + "grad_norm": 12.800756135654732, + "learning_rate": 4.7827114688108985e-08, + "logits/chosen": -1.478043556213379, + "logits/rejected": -1.4925929307937622, + "logps/chosen": -698.2744140625, + "logps/rejected": -822.2246704101562, + "loss": 0.5131, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6346049308776855, + "rewards/margins": 0.8852910399436951, + "rewards/rejected": -3.5198960304260254, + "step": 1005 + }, + { + "epoch": 0.6570332272022206, + "grad_norm": 16.667891699953692, + "learning_rate": 4.7667700175002986e-08, + "logits/chosen": -1.4476077556610107, + "logits/rejected": -1.460363745689392, + "logps/chosen": -839.6358032226562, + "logps/rejected": -914.096923828125, + "loss": 0.4806, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.174222469329834, + "rewards/margins": 1.0781100988388062, + "rewards/rejected": -4.2523322105407715, + "step": 1006 + }, + { + "epoch": 0.657686341742183, + "grad_norm": 25.715571290386414, + "learning_rate": 4.750842793010217e-08, + "logits/chosen": -1.5329231023788452, + "logits/rejected": -1.5252712965011597, + "logps/chosen": -878.42041015625, + "logps/rejected": -985.8428955078125, + "loss": 0.5129, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.7149271965026855, + "rewards/margins": 0.733319878578186, + "rewards/rejected": -4.448246955871582, + "step": 1007 + }, + { + "epoch": 0.6583394562821455, + "grad_norm": 104.75961567327492, + "learning_rate": 4.7349298782439464e-08, + "logits/chosen": -1.488255500793457, + "logits/rejected": -1.4739494323730469, + "logps/chosen": -775.230224609375, + "logps/rejected": -847.8889770507812, + "loss": 0.4897, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7119245529174805, + "rewards/margins": 0.9239524602890015, + "rewards/rejected": -3.6358766555786133, + "step": 1008 + }, + { + "epoch": 0.6589925708221079, + "grad_norm": 52.144252857671894, + "learning_rate": 4.719031356030294e-08, + "logits/chosen": -1.5857702493667603, + "logits/rejected": -1.5610796213150024, + "logps/chosen": -816.1323852539062, + "logps/rejected": -970.305419921875, + "loss": 0.5158, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0045716762542725, + "rewards/margins": 1.0432796478271484, + "rewards/rejected": -4.0478515625, + "step": 1009 + }, + { + "epoch": 0.6596456853620704, + "grad_norm": 102.68956882940834, + "learning_rate": 4.703147309123156e-08, + "logits/chosen": -1.4632647037506104, + "logits/rejected": -1.4590816497802734, + "logps/chosen": -727.7660522460938, + "logps/rejected": -776.1034545898438, + "loss": 0.5347, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.654884099960327, + "rewards/margins": 0.699772298336029, + "rewards/rejected": -3.35465669631958, + "step": 1010 + }, + { + "epoch": 0.6602987999020328, + "grad_norm": 23.05602130979561, + "learning_rate": 4.687277820201077e-08, + "logits/chosen": -1.5591952800750732, + "logits/rejected": -1.5697981119155884, + "logps/chosen": -876.64599609375, + "logps/rejected": -1013.8555297851562, + "loss": 0.4686, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.513322353363037, + "rewards/margins": 0.9770915508270264, + "rewards/rejected": -4.490414142608643, + "step": 1011 + }, + { + "epoch": 0.6609519144419953, + "grad_norm": 36.01319445097211, + "learning_rate": 4.671422971866829e-08, + "logits/chosen": -1.5157580375671387, + "logits/rejected": -1.4674278497695923, + "logps/chosen": -769.1876220703125, + "logps/rejected": -776.2877197265625, + "loss": 0.5207, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8552205562591553, + "rewards/margins": 0.6877336502075195, + "rewards/rejected": -3.542954444885254, + "step": 1012 + }, + { + "epoch": 0.6616050289819577, + "grad_norm": 88.36042423899895, + "learning_rate": 4.655582846646977e-08, + "logits/chosen": -1.4231505393981934, + "logits/rejected": -1.4381476640701294, + "logps/chosen": -826.669921875, + "logps/rejected": -876.0449829101562, + "loss": 0.5261, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.297783613204956, + "rewards/margins": 1.0941379070281982, + "rewards/rejected": -4.391921043395996, + "step": 1013 + }, + { + "epoch": 0.6622581435219201, + "grad_norm": 36.895557036458406, + "learning_rate": 4.6397575269914516e-08, + "logits/chosen": -1.5066965818405151, + "logits/rejected": -1.4606728553771973, + "logps/chosen": -728.8414306640625, + "logps/rejected": -851.2317504882812, + "loss": 0.4876, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7775468826293945, + "rewards/margins": 1.0805087089538574, + "rewards/rejected": -3.858055591583252, + "step": 1014 + }, + { + "epoch": 0.6629112580618826, + "grad_norm": 42.67356514700968, + "learning_rate": 4.6239470952731144e-08, + "logits/chosen": -1.4574358463287354, + "logits/rejected": -1.4584983587265015, + "logps/chosen": -798.5891723632812, + "logps/rejected": -916.482421875, + "loss": 0.5216, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7513608932495117, + "rewards/margins": 0.9783158898353577, + "rewards/rejected": -3.7296767234802246, + "step": 1015 + }, + { + "epoch": 0.6635643726018451, + "grad_norm": 103.36971446244533, + "learning_rate": 4.608151633787337e-08, + "logits/chosen": -1.4695475101470947, + "logits/rejected": -1.4544553756713867, + "logps/chosen": -803.3701171875, + "logps/rejected": -876.03369140625, + "loss": 0.4574, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9469823837280273, + "rewards/margins": 1.0371326208114624, + "rewards/rejected": -3.9841156005859375, + "step": 1016 + }, + { + "epoch": 0.6642174871418075, + "grad_norm": 53.366420843639595, + "learning_rate": 4.5923712247515675e-08, + "logits/chosen": -1.498404860496521, + "logits/rejected": -1.5131099224090576, + "logps/chosen": -852.6907958984375, + "logps/rejected": -1058.0159912109375, + "loss": 0.4487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.187804698944092, + "rewards/margins": 1.617006778717041, + "rewards/rejected": -4.804811954498291, + "step": 1017 + }, + { + "epoch": 0.6648706016817699, + "grad_norm": 37.41870236327263, + "learning_rate": 4.5766059503049046e-08, + "logits/chosen": -1.485478401184082, + "logits/rejected": -1.501274824142456, + "logps/chosen": -788.52734375, + "logps/rejected": -801.43505859375, + "loss": 0.4773, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.898345470428467, + "rewards/margins": 0.5790349245071411, + "rewards/rejected": -3.4773800373077393, + "step": 1018 + }, + { + "epoch": 0.6655237162217323, + "grad_norm": 36.10214381184078, + "learning_rate": 4.560855892507671e-08, + "logits/chosen": -1.5048822164535522, + "logits/rejected": -1.5141572952270508, + "logps/chosen": -830.2411499023438, + "logps/rejected": -920.7844848632812, + "loss": 0.5217, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3689942359924316, + "rewards/margins": 1.0679469108581543, + "rewards/rejected": -4.436941146850586, + "step": 1019 + }, + { + "epoch": 0.6661768307616949, + "grad_norm": 45.389879083458595, + "learning_rate": 4.5451211333409836e-08, + "logits/chosen": -1.4572731256484985, + "logits/rejected": -1.4573543071746826, + "logps/chosen": -800.939697265625, + "logps/rejected": -863.949951171875, + "loss": 0.5109, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.996425151824951, + "rewards/margins": 0.8323167562484741, + "rewards/rejected": -3.828742027282715, + "step": 1020 + }, + { + "epoch": 0.6668299453016573, + "grad_norm": 10.553936200597361, + "learning_rate": 4.5294017547063234e-08, + "logits/chosen": -1.5413297414779663, + "logits/rejected": -1.520641803741455, + "logps/chosen": -755.6485595703125, + "logps/rejected": -840.0621337890625, + "loss": 0.4168, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9807937145233154, + "rewards/margins": 1.0162277221679688, + "rewards/rejected": -3.9970216751098633, + "step": 1021 + }, + { + "epoch": 0.6674830598416197, + "grad_norm": 10.572170985494784, + "learning_rate": 4.513697838425122e-08, + "logits/chosen": -1.5927180051803589, + "logits/rejected": -1.6134157180786133, + "logps/chosen": -868.3167114257812, + "logps/rejected": -956.796875, + "loss": 0.5045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4438788890838623, + "rewards/margins": 1.0897140502929688, + "rewards/rejected": -4.533592700958252, + "step": 1022 + }, + { + "epoch": 0.6681361743815821, + "grad_norm": 20.81491049314488, + "learning_rate": 4.4980094662383206e-08, + "logits/chosen": -1.4901061058044434, + "logits/rejected": -1.4351348876953125, + "logps/chosen": -784.634765625, + "logps/rejected": -926.8602294921875, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.037071704864502, + "rewards/margins": 1.026695966720581, + "rewards/rejected": -4.063767433166504, + "step": 1023 + }, + { + "epoch": 0.6687892889215447, + "grad_norm": 25.45661309370082, + "learning_rate": 4.4823367198059555e-08, + "logits/chosen": -1.513559103012085, + "logits/rejected": -1.5025659799575806, + "logps/chosen": -918.1492919921875, + "logps/rejected": -963.5728759765625, + "loss": 0.4967, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.492802381515503, + "rewards/margins": 0.798168957233429, + "rewards/rejected": -4.290971279144287, + "step": 1024 + }, + { + "epoch": 0.6694424034615071, + "grad_norm": 63.951136926876586, + "learning_rate": 4.466679680706727e-08, + "logits/chosen": -1.5798838138580322, + "logits/rejected": -1.4765055179595947, + "logps/chosen": -794.0283203125, + "logps/rejected": -891.6921997070312, + "loss": 0.5115, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.961698532104492, + "rewards/margins": 0.8770202994346619, + "rewards/rejected": -3.8387184143066406, + "step": 1025 + }, + { + "epoch": 0.6700955180014695, + "grad_norm": 57.70779325077905, + "learning_rate": 4.4510384304375773e-08, + "logits/chosen": -1.4599577188491821, + "logits/rejected": -1.5163075923919678, + "logps/chosen": -797.9346923828125, + "logps/rejected": -910.8660888671875, + "loss": 0.5434, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1370911598205566, + "rewards/margins": 0.9660441279411316, + "rewards/rejected": -4.103135108947754, + "step": 1026 + }, + { + "epoch": 0.6707486325414319, + "grad_norm": 160.10963139231885, + "learning_rate": 4.435413050413264e-08, + "logits/chosen": -1.5408748388290405, + "logits/rejected": -1.4838809967041016, + "logps/chosen": -880.136474609375, + "logps/rejected": -953.572509765625, + "loss": 0.496, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4785830974578857, + "rewards/margins": 0.9672165513038635, + "rewards/rejected": -4.445799350738525, + "step": 1027 + }, + { + "epoch": 0.6714017470813944, + "grad_norm": 30.04810670653747, + "learning_rate": 4.41980362196594e-08, + "logits/chosen": -1.5107932090759277, + "logits/rejected": -1.4906303882598877, + "logps/chosen": -787.1295166015625, + "logps/rejected": -931.1858520507812, + "loss": 0.5156, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.80479097366333, + "rewards/margins": 1.6828497648239136, + "rewards/rejected": -4.487640380859375, + "step": 1028 + }, + { + "epoch": 0.6720548616213569, + "grad_norm": 8.725100728079909, + "learning_rate": 4.4042102263447275e-08, + "logits/chosen": -1.4393929243087769, + "logits/rejected": -1.3799033164978027, + "logps/chosen": -751.6057739257812, + "logps/rejected": -883.99755859375, + "loss": 0.4621, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.040332317352295, + "rewards/margins": 1.2370002269744873, + "rewards/rejected": -4.277332782745361, + "step": 1029 + }, + { + "epoch": 0.6727079761613193, + "grad_norm": 10.666968801911986, + "learning_rate": 4.388632944715296e-08, + "logits/chosen": -1.474955677986145, + "logits/rejected": -1.4837840795516968, + "logps/chosen": -798.3235473632812, + "logps/rejected": -894.8926391601562, + "loss": 0.4808, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9682440757751465, + "rewards/margins": 0.7301187515258789, + "rewards/rejected": -3.6983628273010254, + "step": 1030 + }, + { + "epoch": 0.6733610907012817, + "grad_norm": 60.661967247074934, + "learning_rate": 4.37307185815944e-08, + "logits/chosen": -1.562424898147583, + "logits/rejected": -1.5712769031524658, + "logps/chosen": -810.6946411132812, + "logps/rejected": -843.1661987304688, + "loss": 0.5108, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2374746799468994, + "rewards/margins": 0.8579811453819275, + "rewards/rejected": -4.095455646514893, + "step": 1031 + }, + { + "epoch": 0.6740142052412442, + "grad_norm": 165.13796597577232, + "learning_rate": 4.3575270476746543e-08, + "logits/chosen": -1.4801855087280273, + "logits/rejected": -1.5016859769821167, + "logps/chosen": -857.4439697265625, + "logps/rejected": -974.478515625, + "loss": 0.515, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.03521990776062, + "rewards/margins": 0.830250084400177, + "rewards/rejected": -3.8654701709747314, + "step": 1032 + }, + { + "epoch": 0.6746673197812066, + "grad_norm": 97.99389725505694, + "learning_rate": 4.341998594173717e-08, + "logits/chosen": -1.4819018840789795, + "logits/rejected": -1.4765572547912598, + "logps/chosen": -779.4025268554688, + "logps/rejected": -837.5120239257812, + "loss": 0.4812, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9766929149627686, + "rewards/margins": 0.7156229019165039, + "rewards/rejected": -3.6923160552978516, + "step": 1033 + }, + { + "epoch": 0.6753204343211691, + "grad_norm": 24.808395132696045, + "learning_rate": 4.326486578484266e-08, + "logits/chosen": -1.4551912546157837, + "logits/rejected": -1.4724352359771729, + "logps/chosen": -780.5784301757812, + "logps/rejected": -889.181396484375, + "loss": 0.5369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8798131942749023, + "rewards/margins": 0.8098124265670776, + "rewards/rejected": -3.6896255016326904, + "step": 1034 + }, + { + "epoch": 0.6759735488611315, + "grad_norm": 32.75717304070085, + "learning_rate": 4.310991081348376e-08, + "logits/chosen": -1.4434568881988525, + "logits/rejected": -1.4026532173156738, + "logps/chosen": -810.0265502929688, + "logps/rejected": -864.6695556640625, + "loss": 0.5423, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1554508209228516, + "rewards/margins": 0.6393953561782837, + "rewards/rejected": -3.7948460578918457, + "step": 1035 + }, + { + "epoch": 0.676626663401094, + "grad_norm": 95.45593023373124, + "learning_rate": 4.295512183422145e-08, + "logits/chosen": -1.4506603479385376, + "logits/rejected": -1.4620624780654907, + "logps/chosen": -711.5603637695312, + "logps/rejected": -851.3104858398438, + "loss": 0.5207, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.811810255050659, + "rewards/margins": 1.0253384113311768, + "rewards/rejected": -3.837148666381836, + "step": 1036 + }, + { + "epoch": 0.6772797779410564, + "grad_norm": 13.65496722074414, + "learning_rate": 4.280049965275261e-08, + "logits/chosen": -1.5078392028808594, + "logits/rejected": -1.4908549785614014, + "logps/chosen": -752.6651000976562, + "logps/rejected": -865.7678833007812, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.94309663772583, + "rewards/margins": 0.8273873329162598, + "rewards/rejected": -3.77048397064209, + "step": 1037 + }, + { + "epoch": 0.6779328924810188, + "grad_norm": 23.66625717077645, + "learning_rate": 4.2646045073906e-08, + "logits/chosen": -1.4181371927261353, + "logits/rejected": -1.4423913955688477, + "logps/chosen": -730.9456176757812, + "logps/rejected": -790.7864990234375, + "loss": 0.4546, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9021615982055664, + "rewards/margins": 0.7823746800422668, + "rewards/rejected": -3.6845362186431885, + "step": 1038 + }, + { + "epoch": 0.6785860070209813, + "grad_norm": 38.74541697204639, + "learning_rate": 4.249175890163797e-08, + "logits/chosen": -1.531599521636963, + "logits/rejected": -1.5187830924987793, + "logps/chosen": -762.074951171875, + "logps/rejected": -845.5159912109375, + "loss": 0.4991, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8555479049682617, + "rewards/margins": 0.8805040121078491, + "rewards/rejected": -3.7360520362854004, + "step": 1039 + }, + { + "epoch": 0.6792391215609438, + "grad_norm": 85.38760955194438, + "learning_rate": 4.233764193902828e-08, + "logits/chosen": -1.547985553741455, + "logits/rejected": -1.5426965951919556, + "logps/chosen": -837.4592895507812, + "logps/rejected": -898.4917602539062, + "loss": 0.4809, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8173162937164307, + "rewards/margins": 0.9812331795692444, + "rewards/rejected": -3.798549175262451, + "step": 1040 + }, + { + "epoch": 0.6798922361009062, + "grad_norm": 57.38295376577258, + "learning_rate": 4.2183694988275914e-08, + "logits/chosen": -1.5143656730651855, + "logits/rejected": -1.5319607257843018, + "logps/chosen": -795.1925048828125, + "logps/rejected": -825.0725708007812, + "loss": 0.5237, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.043870210647583, + "rewards/margins": 0.7048807144165039, + "rewards/rejected": -3.748751163482666, + "step": 1041 + }, + { + "epoch": 0.6805453506408686, + "grad_norm": 13.482382576237015, + "learning_rate": 4.2029918850694955e-08, + "logits/chosen": -1.5160479545593262, + "logits/rejected": -1.5341300964355469, + "logps/chosen": -728.4697875976562, + "logps/rejected": -833.2994384765625, + "loss": 0.516, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0110697746276855, + "rewards/margins": 0.9559275507926941, + "rewards/rejected": -3.9669976234436035, + "step": 1042 + }, + { + "epoch": 0.681198465180831, + "grad_norm": 13.93984327839821, + "learning_rate": 4.1876314326710367e-08, + "logits/chosen": -1.4439070224761963, + "logits/rejected": -1.4740190505981445, + "logps/chosen": -883.0184326171875, + "logps/rejected": -914.0108032226562, + "loss": 0.5132, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1336121559143066, + "rewards/margins": 0.8373432159423828, + "rewards/rejected": -3.9709551334381104, + "step": 1043 + }, + { + "epoch": 0.6818515797207936, + "grad_norm": 93.33586037651152, + "learning_rate": 4.172288221585383e-08, + "logits/chosen": -1.4956300258636475, + "logits/rejected": -1.4777367115020752, + "logps/chosen": -819.32958984375, + "logps/rejected": -873.03955078125, + "loss": 0.5033, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1138126850128174, + "rewards/margins": 0.7950230836868286, + "rewards/rejected": -3.9088354110717773, + "step": 1044 + }, + { + "epoch": 0.682504694260756, + "grad_norm": 24.399107508868735, + "learning_rate": 4.1569623316759634e-08, + "logits/chosen": -1.5415163040161133, + "logits/rejected": -1.5112028121948242, + "logps/chosen": -853.9760131835938, + "logps/rejected": -965.800537109375, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.227302312850952, + "rewards/margins": 0.9138199687004089, + "rewards/rejected": -4.141121864318848, + "step": 1045 + }, + { + "epoch": 0.6831578088007184, + "grad_norm": 10.737613957729454, + "learning_rate": 4.1416538427160414e-08, + "logits/chosen": -1.508976936340332, + "logits/rejected": -1.4843047857284546, + "logps/chosen": -843.1103515625, + "logps/rejected": -909.9649658203125, + "loss": 0.5126, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.028240203857422, + "rewards/margins": 0.6666500568389893, + "rewards/rejected": -3.694890022277832, + "step": 1046 + }, + { + "epoch": 0.6838109233406808, + "grad_norm": 13.15409511965096, + "learning_rate": 4.126362834388311e-08, + "logits/chosen": -1.4586668014526367, + "logits/rejected": -1.450480580329895, + "logps/chosen": -823.8426513671875, + "logps/rejected": -831.0048828125, + "loss": 0.5012, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.807426691055298, + "rewards/margins": 0.6834732890129089, + "rewards/rejected": -3.4908998012542725, + "step": 1047 + }, + { + "epoch": 0.6844640378806434, + "grad_norm": 35.5691033339692, + "learning_rate": 4.11108938628448e-08, + "logits/chosen": -1.44548499584198, + "logits/rejected": -1.465252161026001, + "logps/chosen": -912.369140625, + "logps/rejected": -958.8450927734375, + "loss": 0.4735, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.652446746826172, + "rewards/margins": 0.8420040607452393, + "rewards/rejected": -4.494450569152832, + "step": 1048 + }, + { + "epoch": 0.6851171524206058, + "grad_norm": 55.79275790082709, + "learning_rate": 4.095833577904842e-08, + "logits/chosen": -1.5095837116241455, + "logits/rejected": -1.4833836555480957, + "logps/chosen": -766.8374633789062, + "logps/rejected": -825.0868530273438, + "loss": 0.4932, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7053170204162598, + "rewards/margins": 0.8340969085693359, + "rewards/rejected": -3.5394136905670166, + "step": 1049 + }, + { + "epoch": 0.6857702669605682, + "grad_norm": 16.058246806646952, + "learning_rate": 4.0805954886578825e-08, + "logits/chosen": -1.4918146133422852, + "logits/rejected": -1.4603523015975952, + "logps/chosen": -789.8798217773438, + "logps/rejected": -847.2196655273438, + "loss": 0.5385, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.3735291957855225, + "rewards/margins": 0.4681362509727478, + "rewards/rejected": -3.841665744781494, + "step": 1050 + }, + { + "epoch": 0.6864233815005306, + "grad_norm": 25.38044398676047, + "learning_rate": 4.065375197859855e-08, + "logits/chosen": -1.5610326528549194, + "logits/rejected": -1.465526819229126, + "logps/chosen": -896.5455932617188, + "logps/rejected": -925.2203369140625, + "loss": 0.5014, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.476266622543335, + "rewards/margins": 0.7058266401290894, + "rewards/rejected": -4.182093620300293, + "step": 1051 + }, + { + "epoch": 0.6870764960404931, + "grad_norm": 27.871824279674087, + "learning_rate": 4.0501727847343706e-08, + "logits/chosen": -1.5372264385223389, + "logits/rejected": -1.521620512008667, + "logps/chosen": -966.5, + "logps/rejected": -982.9517822265625, + "loss": 0.5236, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.5799427032470703, + "rewards/margins": 0.6284856796264648, + "rewards/rejected": -4.208428382873535, + "step": 1052 + }, + { + "epoch": 0.6877296105804556, + "grad_norm": 58.973884464098276, + "learning_rate": 4.034988328411982e-08, + "logits/chosen": -1.4340426921844482, + "logits/rejected": -1.4606634378433228, + "logps/chosen": -855.3406372070312, + "logps/rejected": -920.906494140625, + "loss": 0.4464, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8592772483825684, + "rewards/margins": 0.6489996314048767, + "rewards/rejected": -3.5082767009735107, + "step": 1053 + }, + { + "epoch": 0.688382725120418, + "grad_norm": 21.96563134850709, + "learning_rate": 4.0198219079297756e-08, + "logits/chosen": -1.4492591619491577, + "logits/rejected": -1.4421489238739014, + "logps/chosen": -826.43359375, + "logps/rejected": -959.9185180664062, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1718485355377197, + "rewards/margins": 0.9998074769973755, + "rewards/rejected": -4.171655654907227, + "step": 1054 + }, + { + "epoch": 0.6890358396603804, + "grad_norm": 56.311096113880815, + "learning_rate": 4.004673602230961e-08, + "logits/chosen": -1.6020833253860474, + "logits/rejected": -1.5822880268096924, + "logps/chosen": -848.7802124023438, + "logps/rejected": -922.5611572265625, + "loss": 0.5243, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1218650341033936, + "rewards/margins": 0.7863477468490601, + "rewards/rejected": -3.908212661743164, + "step": 1055 + }, + { + "epoch": 0.6896889542003429, + "grad_norm": 17.48722643305911, + "learning_rate": 3.989543490164453e-08, + "logits/chosen": -1.4615482091903687, + "logits/rejected": -1.5211315155029297, + "logps/chosen": -854.398681640625, + "logps/rejected": -965.4825439453125, + "loss": 0.4684, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.055753231048584, + "rewards/margins": 0.7217795848846436, + "rewards/rejected": -3.7775330543518066, + "step": 1056 + }, + { + "epoch": 0.6903420687403053, + "grad_norm": 37.57075928693893, + "learning_rate": 3.974431650484468e-08, + "logits/chosen": -1.4383900165557861, + "logits/rejected": -1.3848916292190552, + "logps/chosen": -895.6553344726562, + "logps/rejected": -919.5377807617188, + "loss": 0.536, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.639094352722168, + "rewards/margins": 0.42604905366897583, + "rewards/rejected": -4.06514310836792, + "step": 1057 + }, + { + "epoch": 0.6909951832802678, + "grad_norm": 26.018606342144913, + "learning_rate": 3.95933816185012e-08, + "logits/chosen": -1.5629183053970337, + "logits/rejected": -1.528983235359192, + "logps/chosen": -896.5241088867188, + "logps/rejected": -974.3204345703125, + "loss": 0.5263, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3990674018859863, + "rewards/margins": 0.964748203754425, + "rewards/rejected": -4.363815784454346, + "step": 1058 + }, + { + "epoch": 0.6916482978202302, + "grad_norm": 137.70834817726936, + "learning_rate": 3.944263102824996e-08, + "logits/chosen": -1.4782737493515015, + "logits/rejected": -1.4820241928100586, + "logps/chosen": -844.2286987304688, + "logps/rejected": -879.0075073242188, + "loss": 0.4687, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.190678358078003, + "rewards/margins": 0.6480005979537964, + "rewards/rejected": -3.8386788368225098, + "step": 1059 + }, + { + "epoch": 0.6923014123601927, + "grad_norm": 135.63517874372047, + "learning_rate": 3.9292065518767495e-08, + "logits/chosen": -1.4735994338989258, + "logits/rejected": -1.461629033088684, + "logps/chosen": -815.2098388671875, + "logps/rejected": -904.7244873046875, + "loss": 0.5408, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.002563714981079, + "rewards/margins": 0.6686397194862366, + "rewards/rejected": -3.671203374862671, + "step": 1060 + }, + { + "epoch": 0.6929545269001551, + "grad_norm": 12.912624702236881, + "learning_rate": 3.914168587376706e-08, + "logits/chosen": -1.5482277870178223, + "logits/rejected": -1.5398210287094116, + "logps/chosen": -831.609130859375, + "logps/rejected": -957.0618286132812, + "loss": 0.494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.257383108139038, + "rewards/margins": 0.9438177347183228, + "rewards/rejected": -4.201200485229492, + "step": 1061 + }, + { + "epoch": 0.6936076414401176, + "grad_norm": 19.51174100176863, + "learning_rate": 3.899149287599442e-08, + "logits/chosen": -1.4981271028518677, + "logits/rejected": -1.442700982093811, + "logps/chosen": -779.00927734375, + "logps/rejected": -808.9547729492188, + "loss": 0.5122, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9186878204345703, + "rewards/margins": 0.47457852959632874, + "rewards/rejected": -3.3932666778564453, + "step": 1062 + }, + { + "epoch": 0.69426075598008, + "grad_norm": 71.96791759257684, + "learning_rate": 3.884148730722383e-08, + "logits/chosen": -1.5923326015472412, + "logits/rejected": -1.5967943668365479, + "logps/chosen": -838.6897583007812, + "logps/rejected": -877.890625, + "loss": 0.4842, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1096723079681396, + "rewards/margins": 0.708088219165802, + "rewards/rejected": -3.817760467529297, + "step": 1063 + }, + { + "epoch": 0.6949138705200425, + "grad_norm": 16.78816497884493, + "learning_rate": 3.8691669948253964e-08, + "logits/chosen": -1.4334865808486938, + "logits/rejected": -1.4443024396896362, + "logps/chosen": -852.86572265625, + "logps/rejected": -981.8878173828125, + "loss": 0.483, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1702041625976562, + "rewards/margins": 1.1130633354187012, + "rewards/rejected": -4.283267974853516, + "step": 1064 + }, + { + "epoch": 0.6955669850600049, + "grad_norm": 19.842008322377158, + "learning_rate": 3.85420415789038e-08, + "logits/chosen": -1.4435492753982544, + "logits/rejected": -1.480208396911621, + "logps/chosen": -812.3143920898438, + "logps/rejected": -912.236572265625, + "loss": 0.475, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2056543827056885, + "rewards/margins": 0.799709141254425, + "rewards/rejected": -4.005363464355469, + "step": 1065 + }, + { + "epoch": 0.6962200995999673, + "grad_norm": 17.570135752432474, + "learning_rate": 3.839260297800864e-08, + "logits/chosen": -1.5509750843048096, + "logits/rejected": -1.542686939239502, + "logps/chosen": -918.3096923828125, + "logps/rejected": -1013.3355102539062, + "loss": 0.4643, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.234311103820801, + "rewards/margins": 1.1861119270324707, + "rewards/rejected": -4.42042350769043, + "step": 1066 + }, + { + "epoch": 0.6968732141399298, + "grad_norm": 13.333457525865867, + "learning_rate": 3.824335492341599e-08, + "logits/chosen": -1.5408436059951782, + "logits/rejected": -1.5043909549713135, + "logps/chosen": -823.61865234375, + "logps/rejected": -986.2550048828125, + "loss": 0.4637, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8370096683502197, + "rewards/margins": 1.4054211378097534, + "rewards/rejected": -4.242431163787842, + "step": 1067 + }, + { + "epoch": 0.6975263286798923, + "grad_norm": 19.537814553814222, + "learning_rate": 3.8094298191981565e-08, + "logits/chosen": -1.4838995933532715, + "logits/rejected": -1.483154535293579, + "logps/chosen": -757.1932373046875, + "logps/rejected": -872.9207763671875, + "loss": 0.5317, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.696157217025757, + "rewards/margins": 1.4016393423080444, + "rewards/rejected": -4.09779691696167, + "step": 1068 + }, + { + "epoch": 0.6981794432198547, + "grad_norm": 9.688042362394432, + "learning_rate": 3.794543355956518e-08, + "logits/chosen": -1.4866911172866821, + "logits/rejected": -1.449821949005127, + "logps/chosen": -834.3885498046875, + "logps/rejected": -924.32470703125, + "loss": 0.4753, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.975139856338501, + "rewards/margins": 0.8105930685997009, + "rewards/rejected": -3.7857329845428467, + "step": 1069 + }, + { + "epoch": 0.6988325577598171, + "grad_norm": 13.451541283780028, + "learning_rate": 3.779676180102678e-08, + "logits/chosen": -1.5045047998428345, + "logits/rejected": -1.5032069683074951, + "logps/chosen": -714.1544189453125, + "logps/rejected": -732.0367431640625, + "loss": 0.5696, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.933642864227295, + "rewards/margins": 0.15253925323486328, + "rewards/rejected": -3.0861823558807373, + "step": 1070 + }, + { + "epoch": 0.6994856722997795, + "grad_norm": 12.44342691849441, + "learning_rate": 3.76482836902224e-08, + "logits/chosen": -1.572385311126709, + "logits/rejected": -1.5535383224487305, + "logps/chosen": -902.6881713867188, + "logps/rejected": -1089.9610595703125, + "loss": 0.4715, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.132718801498413, + "rewards/margins": 1.6019270420074463, + "rewards/rejected": -4.734645843505859, + "step": 1071 + }, + { + "epoch": 0.7001387868397421, + "grad_norm": 10.296422466002083, + "learning_rate": 3.750000000000002e-08, + "logits/chosen": -1.5108206272125244, + "logits/rejected": -1.477285623550415, + "logps/chosen": -779.607666015625, + "logps/rejected": -822.5366821289062, + "loss": 0.4943, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6698358058929443, + "rewards/margins": 0.7314268946647644, + "rewards/rejected": -3.4012625217437744, + "step": 1072 + }, + { + "epoch": 0.7007919013797045, + "grad_norm": 19.95219987159032, + "learning_rate": 3.735191150219571e-08, + "logits/chosen": -1.5256969928741455, + "logits/rejected": -1.4763067960739136, + "logps/chosen": -837.349853515625, + "logps/rejected": -901.6431274414062, + "loss": 0.5116, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.106201648712158, + "rewards/margins": 0.7756415009498596, + "rewards/rejected": -3.881843328475952, + "step": 1073 + }, + { + "epoch": 0.7014450159196669, + "grad_norm": 35.78189318471355, + "learning_rate": 3.7204018967629534e-08, + "logits/chosen": -1.4913254976272583, + "logits/rejected": -1.523918628692627, + "logps/chosen": -725.7921142578125, + "logps/rejected": -901.6088256835938, + "loss": 0.496, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.752901315689087, + "rewards/margins": 1.2373815774917603, + "rewards/rejected": -3.9902822971343994, + "step": 1074 + }, + { + "epoch": 0.7020981304596293, + "grad_norm": 67.37326737845098, + "learning_rate": 3.7056323166101525e-08, + "logits/chosen": -1.5039902925491333, + "logits/rejected": -1.489875078201294, + "logps/chosen": -836.2699584960938, + "logps/rejected": -852.8065185546875, + "loss": 0.4636, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.870565891265869, + "rewards/margins": 0.6211225986480713, + "rewards/rejected": -3.4916884899139404, + "step": 1075 + }, + { + "epoch": 0.7027512449995919, + "grad_norm": 122.82544033335364, + "learning_rate": 3.690882486638771e-08, + "logits/chosen": -1.5150420665740967, + "logits/rejected": -1.4826247692108154, + "logps/chosen": -794.5693359375, + "logps/rejected": -939.4715576171875, + "loss": 0.4775, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.9887611865997314, + "rewards/margins": 1.1564222574234009, + "rewards/rejected": -4.145183563232422, + "step": 1076 + }, + { + "epoch": 0.7034043595395543, + "grad_norm": 29.770872322440407, + "learning_rate": 3.6761524836236085e-08, + "logits/chosen": -1.4067631959915161, + "logits/rejected": -1.3834965229034424, + "logps/chosen": -879.8606567382812, + "logps/rejected": -925.6470336914062, + "loss": 0.4719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.328319787979126, + "rewards/margins": 0.6426284313201904, + "rewards/rejected": -3.9709482192993164, + "step": 1077 + }, + { + "epoch": 0.7040574740795167, + "grad_norm": 32.679662772650815, + "learning_rate": 3.6614423842362605e-08, + "logits/chosen": -1.4490833282470703, + "logits/rejected": -1.4676353931427002, + "logps/chosen": -785.7656860351562, + "logps/rejected": -959.332763671875, + "loss": 0.5489, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0860912799835205, + "rewards/margins": 0.828029453754425, + "rewards/rejected": -3.914120674133301, + "step": 1078 + }, + { + "epoch": 0.7047105886194791, + "grad_norm": 26.80045244487548, + "learning_rate": 3.646752265044725e-08, + "logits/chosen": -1.5650575160980225, + "logits/rejected": -1.5426418781280518, + "logps/chosen": -833.9342041015625, + "logps/rejected": -841.90283203125, + "loss": 0.4943, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.413437604904175, + "rewards/margins": 0.4831462800502777, + "rewards/rejected": -3.8965840339660645, + "step": 1079 + }, + { + "epoch": 0.7053637031594416, + "grad_norm": 128.50612923555713, + "learning_rate": 3.6320822025129986e-08, + "logits/chosen": -1.4550724029541016, + "logits/rejected": -1.4530410766601562, + "logps/chosen": -917.0040893554688, + "logps/rejected": -1088.79248046875, + "loss": 0.5566, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.368748903274536, + "rewards/margins": 1.5208771228790283, + "rewards/rejected": -4.889625549316406, + "step": 1080 + }, + { + "epoch": 0.706016817699404, + "grad_norm": 10.851702941973679, + "learning_rate": 3.6174322730006816e-08, + "logits/chosen": -1.5157017707824707, + "logits/rejected": -1.5042166709899902, + "logps/chosen": -775.1571044921875, + "logps/rejected": -932.432373046875, + "loss": 0.4836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8636438846588135, + "rewards/margins": 1.276392936706543, + "rewards/rejected": -4.1400370597839355, + "step": 1081 + }, + { + "epoch": 0.7066699322393665, + "grad_norm": 10.972406234728096, + "learning_rate": 3.6028025527625804e-08, + "logits/chosen": -1.513984203338623, + "logits/rejected": -1.451591968536377, + "logps/chosen": -830.9796142578125, + "logps/rejected": -924.893310546875, + "loss": 0.4536, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.566514015197754, + "rewards/margins": 0.943221390247345, + "rewards/rejected": -4.509735584259033, + "step": 1082 + }, + { + "epoch": 0.7073230467793289, + "grad_norm": 18.986648016914206, + "learning_rate": 3.588193117948301e-08, + "logits/chosen": -1.4808248281478882, + "logits/rejected": -1.5148813724517822, + "logps/chosen": -857.4286499023438, + "logps/rejected": -901.3809814453125, + "loss": 0.5308, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.144946575164795, + "rewards/margins": 0.7388260960578918, + "rewards/rejected": -3.883772611618042, + "step": 1083 + }, + { + "epoch": 0.7079761613192914, + "grad_norm": 45.59239900144737, + "learning_rate": 3.573604044601873e-08, + "logits/chosen": -1.4333078861236572, + "logits/rejected": -1.4482799768447876, + "logps/chosen": -845.7681884765625, + "logps/rejected": -967.3292846679688, + "loss": 0.4798, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3961985111236572, + "rewards/margins": 1.0223309993743896, + "rewards/rejected": -4.418529510498047, + "step": 1084 + }, + { + "epoch": 0.7086292758592538, + "grad_norm": 13.726428268406318, + "learning_rate": 3.559035408661334e-08, + "logits/chosen": -1.4743175506591797, + "logits/rejected": -1.467435598373413, + "logps/chosen": -852.9383544921875, + "logps/rejected": -932.0983276367188, + "loss": 0.4883, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2556493282318115, + "rewards/margins": 1.4109907150268555, + "rewards/rejected": -4.666640281677246, + "step": 1085 + }, + { + "epoch": 0.7092823903992163, + "grad_norm": 24.407459386863895, + "learning_rate": 3.544487285958346e-08, + "logits/chosen": -1.599566102027893, + "logits/rejected": -1.5549285411834717, + "logps/chosen": -739.4046020507812, + "logps/rejected": -724.448486328125, + "loss": 0.5617, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.502041816711426, + "rewards/margins": 0.5174904465675354, + "rewards/rejected": -3.0195322036743164, + "step": 1086 + }, + { + "epoch": 0.7099355049391787, + "grad_norm": 23.319894013252863, + "learning_rate": 3.5299597522177944e-08, + "logits/chosen": -1.571434497833252, + "logits/rejected": -1.5202834606170654, + "logps/chosen": -872.5787353515625, + "logps/rejected": -882.1563720703125, + "loss": 0.5185, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.388620138168335, + "rewards/margins": 0.7215232253074646, + "rewards/rejected": -4.110143184661865, + "step": 1087 + }, + { + "epoch": 0.7105886194791412, + "grad_norm": 83.13631229824816, + "learning_rate": 3.5154528830574e-08, + "logits/chosen": -1.5138779878616333, + "logits/rejected": -1.5081030130386353, + "logps/chosen": -740.7789916992188, + "logps/rejected": -852.57421875, + "loss": 0.4798, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.757505416870117, + "rewards/margins": 1.019026756286621, + "rewards/rejected": -3.7765321731567383, + "step": 1088 + }, + { + "epoch": 0.7112417340191036, + "grad_norm": 15.401125482802248, + "learning_rate": 3.500966753987317e-08, + "logits/chosen": -1.4526937007904053, + "logits/rejected": -1.4075860977172852, + "logps/chosen": -769.2919311523438, + "logps/rejected": -886.1649169921875, + "loss": 0.482, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.7922770977020264, + "rewards/margins": 0.9756352305412292, + "rewards/rejected": -3.7679123878479004, + "step": 1089 + }, + { + "epoch": 0.711894848559066, + "grad_norm": 22.983567681134193, + "learning_rate": 3.486501440409748e-08, + "logits/chosen": -1.5759086608886719, + "logits/rejected": -1.5538822412490845, + "logps/chosen": -872.93896484375, + "logps/rejected": -946.4339599609375, + "loss": 0.4708, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2252018451690674, + "rewards/margins": 0.8281301856040955, + "rewards/rejected": -4.0533318519592285, + "step": 1090 + }, + { + "epoch": 0.7125479630990285, + "grad_norm": 25.263230007411096, + "learning_rate": 3.472057017618547e-08, + "logits/chosen": -1.3890550136566162, + "logits/rejected": -1.3870556354522705, + "logps/chosen": -866.8782958984375, + "logps/rejected": -1034.20458984375, + "loss": 0.4849, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3459925651550293, + "rewards/margins": 1.1013741493225098, + "rewards/rejected": -4.447366237640381, + "step": 1091 + }, + { + "epoch": 0.713201077638991, + "grad_norm": 23.164092211586546, + "learning_rate": 3.4576335607988294e-08, + "logits/chosen": -1.556492567062378, + "logits/rejected": -1.5247762203216553, + "logps/chosen": -825.4620971679688, + "logps/rejected": -909.668701171875, + "loss": 0.452, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7882065773010254, + "rewards/margins": 1.1564209461212158, + "rewards/rejected": -3.944627523422241, + "step": 1092 + }, + { + "epoch": 0.7138541921789534, + "grad_norm": 30.491984391282454, + "learning_rate": 3.44323114502658e-08, + "logits/chosen": -1.4933907985687256, + "logits/rejected": -1.4707281589508057, + "logps/chosen": -831.043701171875, + "logps/rejected": -888.13037109375, + "loss": 0.5366, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.5473837852478027, + "rewards/margins": 0.7864856719970703, + "rewards/rejected": -4.333869934082031, + "step": 1093 + }, + { + "epoch": 0.7145073067189158, + "grad_norm": 26.466125427075724, + "learning_rate": 3.42884984526826e-08, + "logits/chosen": -1.4837186336517334, + "logits/rejected": -1.4462792873382568, + "logps/chosen": -932.162353515625, + "logps/rejected": -1006.1118774414062, + "loss": 0.5344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4717721939086914, + "rewards/margins": 0.9058358073234558, + "rewards/rejected": -4.377608299255371, + "step": 1094 + }, + { + "epoch": 0.7151604212588782, + "grad_norm": 22.114835756620792, + "learning_rate": 3.414489736380423e-08, + "logits/chosen": -1.5490305423736572, + "logits/rejected": -1.528594970703125, + "logps/chosen": -765.060791015625, + "logps/rejected": -864.4068603515625, + "loss": 0.4142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7475967407226562, + "rewards/margins": 1.1565359830856323, + "rewards/rejected": -3.904132843017578, + "step": 1095 + }, + { + "epoch": 0.7158135357988408, + "grad_norm": 13.258696810534204, + "learning_rate": 3.400150893109317e-08, + "logits/chosen": -1.4212427139282227, + "logits/rejected": -1.3869833946228027, + "logps/chosen": -761.0309448242188, + "logps/rejected": -820.0980224609375, + "loss": 0.4784, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5319325923919678, + "rewards/margins": 0.7716701626777649, + "rewards/rejected": -4.30360221862793, + "step": 1096 + }, + { + "epoch": 0.7164666503388032, + "grad_norm": 153.0170543549745, + "learning_rate": 3.385833390090502e-08, + "logits/chosen": -1.4919387102127075, + "logits/rejected": -1.4864999055862427, + "logps/chosen": -876.7687377929688, + "logps/rejected": -904.2010498046875, + "loss": 0.544, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.105478525161743, + "rewards/margins": 0.4873059093952179, + "rewards/rejected": -3.5927841663360596, + "step": 1097 + }, + { + "epoch": 0.7171197648787656, + "grad_norm": 22.73385039619432, + "learning_rate": 3.3715373018484606e-08, + "logits/chosen": -1.5606061220169067, + "logits/rejected": -1.5803030729293823, + "logps/chosen": -864.9635009765625, + "logps/rejected": -979.0385131835938, + "loss": 0.5134, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1329944133758545, + "rewards/margins": 1.058984637260437, + "rewards/rejected": -4.19197940826416, + "step": 1098 + }, + { + "epoch": 0.717772879418728, + "grad_norm": 47.2766708425411, + "learning_rate": 3.357262702796206e-08, + "logits/chosen": -1.5417003631591797, + "logits/rejected": -1.5057967901229858, + "logps/chosen": -882.0269775390625, + "logps/rejected": -921.403076171875, + "loss": 0.4704, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9814200401306152, + "rewards/margins": 0.637610912322998, + "rewards/rejected": -3.619030714035034, + "step": 1099 + }, + { + "epoch": 0.7184259939586906, + "grad_norm": 17.039889063919627, + "learning_rate": 3.343009667234898e-08, + "logits/chosen": -1.4841748476028442, + "logits/rejected": -1.5299304723739624, + "logps/chosen": -809.2041625976562, + "logps/rejected": -876.6251831054688, + "loss": 0.4307, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.4067025184631348, + "rewards/margins": 0.9841042757034302, + "rewards/rejected": -4.390806674957275, + "step": 1100 + }, + { + "epoch": 0.7184259939586906, + "eval_logits/chosen": -1.4911024570465088, + "eval_logits/rejected": -1.4728410243988037, + "eval_logps/chosen": -824.2158813476562, + "eval_logps/rejected": -903.2113037109375, + "eval_loss": 0.498925119638443, + "eval_rewards/accuracies": 0.7609999775886536, + "eval_rewards/chosen": -3.1387054920196533, + "eval_rewards/margins": 0.8710131645202637, + "eval_rewards/rejected": -4.009718418121338, + "eval_runtime": 300.1149, + "eval_samples_per_second": 13.328, + "eval_steps_per_second": 0.833, + "step": 1100 + }, + { + "epoch": 0.719079108498653, + "grad_norm": 16.11848033889973, + "learning_rate": 3.3287782693534566e-08, + "logits/chosen": -1.4810887575149536, + "logits/rejected": -1.4817496538162231, + "logps/chosen": -845.0581665039062, + "logps/rejected": -940.9647827148438, + "loss": 0.5442, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.3775806427001953, + "rewards/margins": 0.7508167624473572, + "rewards/rejected": -4.128397464752197, + "step": 1101 + }, + { + "epoch": 0.7197322230386154, + "grad_norm": 68.65354889922948, + "learning_rate": 3.3145685832281736e-08, + "logits/chosen": -1.5042979717254639, + "logits/rejected": -1.523160457611084, + "logps/chosen": -804.5962524414062, + "logps/rejected": -879.611572265625, + "loss": 0.437, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8270790576934814, + "rewards/margins": 0.7390732765197754, + "rewards/rejected": -3.566152572631836, + "step": 1102 + }, + { + "epoch": 0.7203853375785778, + "grad_norm": 73.57913188242289, + "learning_rate": 3.30038068282233e-08, + "logits/chosen": -1.4858269691467285, + "logits/rejected": -1.440920352935791, + "logps/chosen": -834.6658935546875, + "logps/rejected": -878.7103271484375, + "loss": 0.4838, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6924118995666504, + "rewards/margins": 0.7402853965759277, + "rewards/rejected": -4.432697772979736, + "step": 1103 + }, + { + "epoch": 0.7210384521185403, + "grad_norm": 13.062198655888952, + "learning_rate": 3.286214641985807e-08, + "logits/chosen": -1.5750617980957031, + "logits/rejected": -1.491349458694458, + "logps/chosen": -816.414306640625, + "logps/rejected": -881.9208984375, + "loss": 0.4839, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.108093738555908, + "rewards/margins": 0.7648839950561523, + "rewards/rejected": -3.8729777336120605, + "step": 1104 + }, + { + "epoch": 0.7216915666585028, + "grad_norm": 14.08459741772447, + "learning_rate": 3.272070534454708e-08, + "logits/chosen": -1.5048151016235352, + "logits/rejected": -1.4695175886154175, + "logps/chosen": -810.2633056640625, + "logps/rejected": -856.9027099609375, + "loss": 0.4868, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.639930486679077, + "rewards/margins": 0.8205062747001648, + "rewards/rejected": -3.4604365825653076, + "step": 1105 + }, + { + "epoch": 0.7223446811984652, + "grad_norm": 9.792790670749572, + "learning_rate": 3.2579484338509616e-08, + "logits/chosen": -1.441813349723816, + "logits/rejected": -1.4555586576461792, + "logps/chosen": -759.58154296875, + "logps/rejected": -834.8800659179688, + "loss": 0.4439, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1834728717803955, + "rewards/margins": 0.8506563901901245, + "rewards/rejected": -4.0341291427612305, + "step": 1106 + }, + { + "epoch": 0.7229977957384276, + "grad_norm": 13.42092316503716, + "learning_rate": 3.2438484136819575e-08, + "logits/chosen": -1.5256390571594238, + "logits/rejected": -1.5091501474380493, + "logps/chosen": -882.526123046875, + "logps/rejected": -992.293212890625, + "loss": 0.4902, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3528499603271484, + "rewards/margins": 1.2070109844207764, + "rewards/rejected": -4.559861183166504, + "step": 1107 + }, + { + "epoch": 0.7236509102783901, + "grad_norm": 36.798480052905624, + "learning_rate": 3.22977054734015e-08, + "logits/chosen": -1.612870693206787, + "logits/rejected": -1.6026808023452759, + "logps/chosen": -824.7280883789062, + "logps/rejected": -989.9205322265625, + "loss": 0.5238, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1623008251190186, + "rewards/margins": 1.1107250452041626, + "rewards/rejected": -4.273025989532471, + "step": 1108 + }, + { + "epoch": 0.7243040248183525, + "grad_norm": 15.657306421690263, + "learning_rate": 3.215714908102678e-08, + "logits/chosen": -1.4942344427108765, + "logits/rejected": -1.447746753692627, + "logps/chosen": -790.576904296875, + "logps/rejected": -844.633056640625, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0599842071533203, + "rewards/margins": 0.9251794815063477, + "rewards/rejected": -3.985163450241089, + "step": 1109 + }, + { + "epoch": 0.724957139358315, + "grad_norm": 10.52893767352228, + "learning_rate": 3.201681569130988e-08, + "logits/chosen": -1.4922351837158203, + "logits/rejected": -1.4611746072769165, + "logps/chosen": -874.3091430664062, + "logps/rejected": -943.7155151367188, + "loss": 0.4354, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2651565074920654, + "rewards/margins": 0.8614932894706726, + "rewards/rejected": -4.126649379730225, + "step": 1110 + }, + { + "epoch": 0.7256102538982774, + "grad_norm": 54.5450174448529, + "learning_rate": 3.187670603470451e-08, + "logits/chosen": -1.474999189376831, + "logits/rejected": -1.5109246969223022, + "logps/chosen": -883.6320190429688, + "logps/rejected": -954.03369140625, + "loss": 0.4765, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4104151725769043, + "rewards/margins": 0.9843173027038574, + "rewards/rejected": -4.394732475280762, + "step": 1111 + }, + { + "epoch": 0.7262633684382399, + "grad_norm": 59.388463871448565, + "learning_rate": 3.173682084049979e-08, + "logits/chosen": -1.4505696296691895, + "logits/rejected": -1.458186388015747, + "logps/chosen": -664.27587890625, + "logps/rejected": -750.30078125, + "loss": 0.4635, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.536130428314209, + "rewards/margins": 0.9553031921386719, + "rewards/rejected": -3.49143385887146, + "step": 1112 + }, + { + "epoch": 0.7269164829782023, + "grad_norm": 137.01606102320702, + "learning_rate": 3.159716083681652e-08, + "logits/chosen": -1.6230467557907104, + "logits/rejected": -1.5941567420959473, + "logps/chosen": -926.1805419921875, + "logps/rejected": -998.4340209960938, + "loss": 0.5433, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.0903451442718506, + "rewards/margins": 1.0845158100128174, + "rewards/rejected": -4.174860954284668, + "step": 1113 + }, + { + "epoch": 0.7275695975181647, + "grad_norm": 29.04184199104772, + "learning_rate": 3.1457726750603317e-08, + "logits/chosen": -1.512034296989441, + "logits/rejected": -1.47452712059021, + "logps/chosen": -846.271728515625, + "logps/rejected": -941.9117431640625, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4414303302764893, + "rewards/margins": 1.0037970542907715, + "rewards/rejected": -4.44522762298584, + "step": 1114 + }, + { + "epoch": 0.7282227120581272, + "grad_norm": 67.67268164595161, + "learning_rate": 3.131851930763289e-08, + "logits/chosen": -1.51804518699646, + "logits/rejected": -1.497375726699829, + "logps/chosen": -890.821533203125, + "logps/rejected": -1002.13916015625, + "loss": 0.4851, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.418884515762329, + "rewards/margins": 0.7488983869552612, + "rewards/rejected": -4.167783260345459, + "step": 1115 + }, + { + "epoch": 0.7288758265980897, + "grad_norm": 20.304379891765954, + "learning_rate": 3.1179539232498276e-08, + "logits/chosen": -1.4011988639831543, + "logits/rejected": -1.400657296180725, + "logps/chosen": -691.755615234375, + "logps/rejected": -767.6619873046875, + "loss": 0.5219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7513551712036133, + "rewards/margins": 0.778099775314331, + "rewards/rejected": -3.5294547080993652, + "step": 1116 + }, + { + "epoch": 0.7295289411380521, + "grad_norm": 11.565056384039993, + "learning_rate": 3.104078724860892e-08, + "logits/chosen": -1.6112922430038452, + "logits/rejected": -1.5689594745635986, + "logps/chosen": -833.8433837890625, + "logps/rejected": -893.80615234375, + "loss": 0.514, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4891977310180664, + "rewards/margins": 0.7067195177078247, + "rewards/rejected": -4.195917129516602, + "step": 1117 + }, + { + "epoch": 0.7301820556780145, + "grad_norm": 40.26811950012431, + "learning_rate": 3.090226407818714e-08, + "logits/chosen": -1.5029704570770264, + "logits/rejected": -1.4893207550048828, + "logps/chosen": -858.901611328125, + "logps/rejected": -917.4168090820312, + "loss": 0.5033, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3486008644104004, + "rewards/margins": 0.908878743648529, + "rewards/rejected": -4.257479667663574, + "step": 1118 + }, + { + "epoch": 0.730835170217977, + "grad_norm": 16.207319176406262, + "learning_rate": 3.07639704422642e-08, + "logits/chosen": -1.4990010261535645, + "logits/rejected": -1.4996980428695679, + "logps/chosen": -875.76904296875, + "logps/rejected": -1025.61767578125, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.398207902908325, + "rewards/margins": 1.0922547578811646, + "rewards/rejected": -4.490462303161621, + "step": 1119 + }, + { + "epoch": 0.7314882847579395, + "grad_norm": 33.13246656200166, + "learning_rate": 3.06259070606766e-08, + "logits/chosen": -1.4612716436386108, + "logits/rejected": -1.4551949501037598, + "logps/chosen": -897.0416259765625, + "logps/rejected": -981.119873046875, + "loss": 0.491, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.664212465286255, + "rewards/margins": 0.6127184629440308, + "rewards/rejected": -4.276930809020996, + "step": 1120 + }, + { + "epoch": 0.7321413992979019, + "grad_norm": 27.787496208253557, + "learning_rate": 3.048807465206237e-08, + "logits/chosen": -1.4162070751190186, + "logits/rejected": -1.4611150026321411, + "logps/chosen": -788.558349609375, + "logps/rejected": -879.7010498046875, + "loss": 0.5043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.006227493286133, + "rewards/margins": 1.1710853576660156, + "rewards/rejected": -4.177312850952148, + "step": 1121 + }, + { + "epoch": 0.7327945138378643, + "grad_norm": 31.88982746049559, + "learning_rate": 3.035047393385725e-08, + "logits/chosen": -1.5180845260620117, + "logits/rejected": -1.5133084058761597, + "logps/chosen": -882.9698486328125, + "logps/rejected": -901.154052734375, + "loss": 0.5017, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.381559371948242, + "rewards/margins": 0.6179601550102234, + "rewards/rejected": -3.9995198249816895, + "step": 1122 + }, + { + "epoch": 0.7334476283778267, + "grad_norm": 14.275386018313844, + "learning_rate": 3.021310562229105e-08, + "logits/chosen": -1.424015998840332, + "logits/rejected": -1.391045331954956, + "logps/chosen": -808.16796875, + "logps/rejected": -890.8568115234375, + "loss": 0.4597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.398470163345337, + "rewards/margins": 0.7546467781066895, + "rewards/rejected": -4.1531171798706055, + "step": 1123 + }, + { + "epoch": 0.7341007429177893, + "grad_norm": 31.589588156255736, + "learning_rate": 3.0075970432383824e-08, + "logits/chosen": -1.4777663946151733, + "logits/rejected": -1.4628345966339111, + "logps/chosen": -788.1425170898438, + "logps/rejected": -1018.20751953125, + "loss": 0.4613, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.831998348236084, + "rewards/margins": 1.624790906906128, + "rewards/rejected": -4.456789016723633, + "step": 1124 + }, + { + "epoch": 0.7347538574577517, + "grad_norm": 23.163673831394075, + "learning_rate": 2.993906907794223e-08, + "logits/chosen": -1.4283198118209839, + "logits/rejected": -1.4135866165161133, + "logps/chosen": -848.6114501953125, + "logps/rejected": -875.2060546875, + "loss": 0.4822, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.065795660018921, + "rewards/margins": 0.7749052047729492, + "rewards/rejected": -3.84070086479187, + "step": 1125 + }, + { + "epoch": 0.7354069719977141, + "grad_norm": 66.36390765427439, + "learning_rate": 2.980240227155578e-08, + "logits/chosen": -1.4410760402679443, + "logits/rejected": -1.4437801837921143, + "logps/chosen": -856.7933959960938, + "logps/rejected": -962.9047241210938, + "loss": 0.5124, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.395413875579834, + "rewards/margins": 0.9301176071166992, + "rewards/rejected": -4.325531959533691, + "step": 1126 + }, + { + "epoch": 0.7360600865376765, + "grad_norm": 81.45798747264044, + "learning_rate": 2.9665970724593113e-08, + "logits/chosen": -1.5262151956558228, + "logits/rejected": -1.4462474584579468, + "logps/chosen": -931.7483520507812, + "logps/rejected": -1020.103271484375, + "loss": 0.515, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.550004720687866, + "rewards/margins": 1.2063486576080322, + "rewards/rejected": -4.756353378295898, + "step": 1127 + }, + { + "epoch": 0.736713201077639, + "grad_norm": 35.012242513097036, + "learning_rate": 2.9529775147198323e-08, + "logits/chosen": -1.5010284185409546, + "logits/rejected": -1.511681079864502, + "logps/chosen": -828.48583984375, + "logps/rejected": -907.3237915039062, + "loss": 0.4117, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.0704474449157715, + "rewards/margins": 1.052125096321106, + "rewards/rejected": -4.122572898864746, + "step": 1128 + }, + { + "epoch": 0.7373663156176015, + "grad_norm": 12.390653146331033, + "learning_rate": 2.9393816248287257e-08, + "logits/chosen": -1.4855061769485474, + "logits/rejected": -1.5361738204956055, + "logps/chosen": -775.18359375, + "logps/rejected": -859.5416259765625, + "loss": 0.4782, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.886140823364258, + "rewards/margins": 0.9194518327713013, + "rewards/rejected": -3.8055927753448486, + "step": 1129 + }, + { + "epoch": 0.7380194301575639, + "grad_norm": 41.60514101169407, + "learning_rate": 2.925809473554382e-08, + "logits/chosen": -1.4579628705978394, + "logits/rejected": -1.4414145946502686, + "logps/chosen": -883.7291870117188, + "logps/rejected": -980.4385375976562, + "loss": 0.505, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3500232696533203, + "rewards/margins": 0.7552958726882935, + "rewards/rejected": -4.105319023132324, + "step": 1130 + }, + { + "epoch": 0.7386725446975263, + "grad_norm": 23.290126695482954, + "learning_rate": 2.9122611315416283e-08, + "logits/chosen": -1.5425716638565063, + "logits/rejected": -1.511645793914795, + "logps/chosen": -785.6270141601562, + "logps/rejected": -796.7711181640625, + "loss": 0.4878, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8701109886169434, + "rewards/margins": 0.560005784034729, + "rewards/rejected": -3.430117130279541, + "step": 1131 + }, + { + "epoch": 0.7393256592374887, + "grad_norm": 14.618557708069593, + "learning_rate": 2.898736669311361e-08, + "logits/chosen": -1.5095089673995972, + "logits/rejected": -1.424134373664856, + "logps/chosen": -893.7913818359375, + "logps/rejected": -921.09912109375, + "loss": 0.5096, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3666210174560547, + "rewards/margins": 1.207306981086731, + "rewards/rejected": -4.573927879333496, + "step": 1132 + }, + { + "epoch": 0.7399787737774512, + "grad_norm": 25.284769898933742, + "learning_rate": 2.8852361572601798e-08, + "logits/chosen": -1.4017362594604492, + "logits/rejected": -1.4303860664367676, + "logps/chosen": -785.92041015625, + "logps/rejected": -907.3682861328125, + "loss": 0.4776, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0821828842163086, + "rewards/margins": 1.0643155574798584, + "rewards/rejected": -4.146498680114746, + "step": 1133 + }, + { + "epoch": 0.7406318883174137, + "grad_norm": 83.88894947610976, + "learning_rate": 2.8717596656600207e-08, + "logits/chosen": -1.4224882125854492, + "logits/rejected": -1.4386307001113892, + "logps/chosen": -819.8443603515625, + "logps/rejected": -834.15234375, + "loss": 0.6099, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7083494663238525, + "rewards/margins": 0.4391818642616272, + "rewards/rejected": -4.147531509399414, + "step": 1134 + }, + { + "epoch": 0.7412850028573761, + "grad_norm": 86.50212048050254, + "learning_rate": 2.8583072646577904e-08, + "logits/chosen": -1.5508708953857422, + "logits/rejected": -1.5168962478637695, + "logps/chosen": -878.55712890625, + "logps/rejected": -941.9532470703125, + "loss": 0.4686, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2938575744628906, + "rewards/margins": 0.9149394631385803, + "rewards/rejected": -4.208796977996826, + "step": 1135 + }, + { + "epoch": 0.7419381173973385, + "grad_norm": 46.81398792643234, + "learning_rate": 2.8448790242750002e-08, + "logits/chosen": -1.4926443099975586, + "logits/rejected": -1.4884159564971924, + "logps/chosen": -827.5214233398438, + "logps/rejected": -890.3485107421875, + "loss": 0.5022, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1017537117004395, + "rewards/margins": 0.6141247749328613, + "rewards/rejected": -3.715878486633301, + "step": 1136 + }, + { + "epoch": 0.742591231937301, + "grad_norm": 73.85737704941202, + "learning_rate": 2.831475014407402e-08, + "logits/chosen": -1.5685663223266602, + "logits/rejected": -1.5168462991714478, + "logps/chosen": -825.589599609375, + "logps/rejected": -906.937744140625, + "loss": 0.5298, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.318887948989868, + "rewards/margins": 1.1284462213516235, + "rewards/rejected": -4.447334289550781, + "step": 1137 + }, + { + "epoch": 0.7432443464772635, + "grad_norm": 86.46807131648339, + "learning_rate": 2.8180953048246247e-08, + "logits/chosen": -1.4860353469848633, + "logits/rejected": -1.468380093574524, + "logps/chosen": -782.5503540039062, + "logps/rejected": -827.2178344726562, + "loss": 0.4897, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.859600067138672, + "rewards/margins": 0.6450478434562683, + "rewards/rejected": -3.5046474933624268, + "step": 1138 + }, + { + "epoch": 0.7438974610172259, + "grad_norm": 84.46407466008054, + "learning_rate": 2.8047399651698154e-08, + "logits/chosen": -1.4216095209121704, + "logits/rejected": -1.4595965147018433, + "logps/chosen": -844.6117553710938, + "logps/rejected": -825.7047729492188, + "loss": 0.4965, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.3110671043395996, + "rewards/margins": 0.38550424575805664, + "rewards/rejected": -3.6965713500976562, + "step": 1139 + }, + { + "epoch": 0.7445505755571883, + "grad_norm": 55.02374788341525, + "learning_rate": 2.791409064959262e-08, + "logits/chosen": -1.4333022832870483, + "logits/rejected": -1.3619767427444458, + "logps/chosen": -795.4210205078125, + "logps/rejected": -872.92333984375, + "loss": 0.4726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1887168884277344, + "rewards/margins": 0.8975101113319397, + "rewards/rejected": -4.086226940155029, + "step": 1140 + }, + { + "epoch": 0.7452036900971508, + "grad_norm": 87.71895948569228, + "learning_rate": 2.7781026735820516e-08, + "logits/chosen": -1.5040805339813232, + "logits/rejected": -1.47743821144104, + "logps/chosen": -840.98583984375, + "logps/rejected": -963.7841796875, + "loss": 0.4646, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2599844932556152, + "rewards/margins": 1.1233594417572021, + "rewards/rejected": -4.3833441734313965, + "step": 1141 + }, + { + "epoch": 0.7458568046371132, + "grad_norm": 19.67396573430194, + "learning_rate": 2.7648208602996965e-08, + "logits/chosen": -1.5113935470581055, + "logits/rejected": -1.4451313018798828, + "logps/chosen": -839.17578125, + "logps/rejected": -976.3116455078125, + "loss": 0.4397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.220781087875366, + "rewards/margins": 1.0840184688568115, + "rewards/rejected": -4.304799556732178, + "step": 1142 + }, + { + "epoch": 0.7465099191770757, + "grad_norm": 84.55591225773897, + "learning_rate": 2.751563694245776e-08, + "logits/chosen": -1.480290412902832, + "logits/rejected": -1.5081285238265991, + "logps/chosen": -809.2010498046875, + "logps/rejected": -877.5315551757812, + "loss": 0.4513, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9877707958221436, + "rewards/margins": 0.8733527660369873, + "rewards/rejected": -3.8611233234405518, + "step": 1143 + }, + { + "epoch": 0.7471630337170381, + "grad_norm": 95.02644629261052, + "learning_rate": 2.7383312444255792e-08, + "logits/chosen": -1.5553948879241943, + "logits/rejected": -1.5468087196350098, + "logps/chosen": -876.2579345703125, + "logps/rejected": -1008.3162231445312, + "loss": 0.4952, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.5550930500030518, + "rewards/margins": 1.053817629814148, + "rewards/rejected": -4.60891056060791, + "step": 1144 + }, + { + "epoch": 0.7478161482570006, + "grad_norm": 81.78452279244114, + "learning_rate": 2.7251235797157426e-08, + "logits/chosen": -1.5420262813568115, + "logits/rejected": -1.5177980661392212, + "logps/chosen": -839.2437133789062, + "logps/rejected": -872.8123779296875, + "loss": 0.5823, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.291133403778076, + "rewards/margins": 0.3555396795272827, + "rewards/rejected": -3.6466729640960693, + "step": 1145 + }, + { + "epoch": 0.748469262796963, + "grad_norm": 18.413788007685703, + "learning_rate": 2.7119407688638925e-08, + "logits/chosen": -1.4686810970306396, + "logits/rejected": -1.434198021888733, + "logps/chosen": -887.4147338867188, + "logps/rejected": -914.2786865234375, + "loss": 0.5467, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.379523754119873, + "rewards/margins": 0.6430516839027405, + "rewards/rejected": -4.022575378417969, + "step": 1146 + }, + { + "epoch": 0.7491223773369254, + "grad_norm": 25.04209084197724, + "learning_rate": 2.6987828804882885e-08, + "logits/chosen": -1.473648190498352, + "logits/rejected": -1.4673340320587158, + "logps/chosen": -755.6442260742188, + "logps/rejected": -806.13818359375, + "loss": 0.5137, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.124025344848633, + "rewards/margins": 0.8462156653404236, + "rewards/rejected": -3.970241069793701, + "step": 1147 + }, + { + "epoch": 0.7497754918768879, + "grad_norm": 17.831323248171984, + "learning_rate": 2.6856499830774655e-08, + "logits/chosen": -1.5108214616775513, + "logits/rejected": -1.4946174621582031, + "logps/chosen": -833.5274658203125, + "logps/rejected": -902.3148193359375, + "loss": 0.533, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2335917949676514, + "rewards/margins": 1.1803102493286133, + "rewards/rejected": -4.413901329040527, + "step": 1148 + }, + { + "epoch": 0.7504286064168504, + "grad_norm": 55.940688397054416, + "learning_rate": 2.6725421449898775e-08, + "logits/chosen": -1.5316839218139648, + "logits/rejected": -1.4671372175216675, + "logps/chosen": -804.1160278320312, + "logps/rejected": -807.0818481445312, + "loss": 0.4974, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2132842540740967, + "rewards/margins": 0.4933798611164093, + "rewards/rejected": -3.7066640853881836, + "step": 1149 + }, + { + "epoch": 0.7510817209568128, + "grad_norm": 12.040478011711151, + "learning_rate": 2.6594594344535416e-08, + "logits/chosen": -1.5430724620819092, + "logits/rejected": -1.4982068538665771, + "logps/chosen": -903.4131469726562, + "logps/rejected": -982.6273193359375, + "loss": 0.494, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6371841430664062, + "rewards/margins": 0.9791971445083618, + "rewards/rejected": -4.616381645202637, + "step": 1150 + }, + { + "epoch": 0.7517348354967752, + "grad_norm": 17.536012470211805, + "learning_rate": 2.646401919565679e-08, + "logits/chosen": -1.5547937154769897, + "logits/rejected": -1.578386664390564, + "logps/chosen": -779.9551391601562, + "logps/rejected": -914.7993774414062, + "loss": 0.4548, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0140175819396973, + "rewards/margins": 1.036094307899475, + "rewards/rejected": -4.050111770629883, + "step": 1151 + }, + { + "epoch": 0.7523879500367376, + "grad_norm": 13.483448697030374, + "learning_rate": 2.6333696682923677e-08, + "logits/chosen": -1.478268027305603, + "logits/rejected": -1.4786677360534668, + "logps/chosen": -848.6054077148438, + "logps/rejected": -985.2492065429688, + "loss": 0.3929, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.365396499633789, + "rewards/margins": 1.4922524690628052, + "rewards/rejected": -4.857649326324463, + "step": 1152 + }, + { + "epoch": 0.7530410645767002, + "grad_norm": 17.054706803705002, + "learning_rate": 2.6203627484681862e-08, + "logits/chosen": -1.5397868156433105, + "logits/rejected": -1.4064992666244507, + "logps/chosen": -779.4160766601562, + "logps/rejected": -861.3985595703125, + "loss": 0.5078, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.2905077934265137, + "rewards/margins": 1.0052298307418823, + "rewards/rejected": -4.295737266540527, + "step": 1153 + }, + { + "epoch": 0.7536941791166626, + "grad_norm": 32.79446433639373, + "learning_rate": 2.6073812277958565e-08, + "logits/chosen": -1.4748960733413696, + "logits/rejected": -1.4349013566970825, + "logps/chosen": -851.8651123046875, + "logps/rejected": -946.66357421875, + "loss": 0.4066, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2334001064300537, + "rewards/margins": 0.994164228439331, + "rewards/rejected": -4.227564334869385, + "step": 1154 + }, + { + "epoch": 0.754347293656625, + "grad_norm": 67.74543674863597, + "learning_rate": 2.5944251738458985e-08, + "logits/chosen": -1.4562938213348389, + "logits/rejected": -1.4436331987380981, + "logps/chosen": -778.5364990234375, + "logps/rejected": -855.3143920898438, + "loss": 0.4975, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.058043956756592, + "rewards/margins": 0.7909223437309265, + "rewards/rejected": -3.848966360092163, + "step": 1155 + }, + { + "epoch": 0.7550004081965874, + "grad_norm": 120.90210970935338, + "learning_rate": 2.58149465405627e-08, + "logits/chosen": -1.5155216455459595, + "logits/rejected": -1.5171725749969482, + "logps/chosen": -874.121826171875, + "logps/rejected": -1065.99609375, + "loss": 0.5047, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.261012554168701, + "rewards/margins": 1.1070443391799927, + "rewards/rejected": -4.368056774139404, + "step": 1156 + }, + { + "epoch": 0.75565352273655, + "grad_norm": 22.64305491156398, + "learning_rate": 2.5685897357320236e-08, + "logits/chosen": -1.5339815616607666, + "logits/rejected": -1.534791350364685, + "logps/chosen": -835.3990478515625, + "logps/rejected": -900.058349609375, + "loss": 0.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2667551040649414, + "rewards/margins": 0.9697099328041077, + "rewards/rejected": -4.236464977264404, + "step": 1157 + }, + { + "epoch": 0.7563066372765124, + "grad_norm": 12.582513483094319, + "learning_rate": 2.555710486044951e-08, + "logits/chosen": -1.5386102199554443, + "logits/rejected": -1.504981279373169, + "logps/chosen": -791.6339721679688, + "logps/rejected": -953.6394653320312, + "loss": 0.4794, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0311837196350098, + "rewards/margins": 1.1218692064285278, + "rewards/rejected": -4.153052806854248, + "step": 1158 + }, + { + "epoch": 0.7569597518164748, + "grad_norm": 12.043718502977505, + "learning_rate": 2.542856972033237e-08, + "logits/chosen": -1.5250686407089233, + "logits/rejected": -1.502375602722168, + "logps/chosen": -846.478515625, + "logps/rejected": -939.0611572265625, + "loss": 0.4625, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.351508378982544, + "rewards/margins": 1.0537912845611572, + "rewards/rejected": -4.405299186706543, + "step": 1159 + }, + { + "epoch": 0.7576128663564372, + "grad_norm": 38.38806781229567, + "learning_rate": 2.5300292606011058e-08, + "logits/chosen": -1.334637999534607, + "logits/rejected": -1.3652501106262207, + "logps/chosen": -769.8375854492188, + "logps/rejected": -865.3778076171875, + "loss": 0.4443, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9790191650390625, + "rewards/margins": 0.9127359986305237, + "rewards/rejected": -3.8917548656463623, + "step": 1160 + }, + { + "epoch": 0.7582659808963997, + "grad_norm": 67.81176562059466, + "learning_rate": 2.5172274185184795e-08, + "logits/chosen": -1.5681705474853516, + "logits/rejected": -1.5657100677490234, + "logps/chosen": -860.4470825195312, + "logps/rejected": -908.8406982421875, + "loss": 0.5013, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.02535080909729, + "rewards/margins": 0.7398912906646729, + "rewards/rejected": -3.765242099761963, + "step": 1161 + }, + { + "epoch": 0.7589190954363622, + "grad_norm": 18.366197171597655, + "learning_rate": 2.504451512420624e-08, + "logits/chosen": -1.439697265625, + "logits/rejected": -1.4873735904693604, + "logps/chosen": -856.3161010742188, + "logps/rejected": -1009.01708984375, + "loss": 0.4974, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3951454162597656, + "rewards/margins": 1.1207799911499023, + "rewards/rejected": -4.515925407409668, + "step": 1162 + }, + { + "epoch": 0.7595722099763246, + "grad_norm": 14.372222255787399, + "learning_rate": 2.491701608807807e-08, + "logits/chosen": -1.467217206954956, + "logits/rejected": -1.4398670196533203, + "logps/chosen": -864.7401123046875, + "logps/rejected": -916.0717163085938, + "loss": 0.5183, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.4642720222473145, + "rewards/margins": 0.79485023021698, + "rewards/rejected": -4.259122848510742, + "step": 1163 + }, + { + "epoch": 0.760225324516287, + "grad_norm": 51.64803091036702, + "learning_rate": 2.478977774044948e-08, + "logits/chosen": -1.4573464393615723, + "logits/rejected": -1.450842261314392, + "logps/chosen": -743.4122314453125, + "logps/rejected": -863.0440063476562, + "loss": 0.4459, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0900349617004395, + "rewards/margins": 0.9152263402938843, + "rewards/rejected": -4.005261421203613, + "step": 1164 + }, + { + "epoch": 0.7608784390562495, + "grad_norm": 22.157747485258298, + "learning_rate": 2.466280074361277e-08, + "logits/chosen": -1.5302906036376953, + "logits/rejected": -1.5034409761428833, + "logps/chosen": -872.1870727539062, + "logps/rejected": -945.05859375, + "loss": 0.5473, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.0566864013671875, + "rewards/margins": 0.5899438858032227, + "rewards/rejected": -4.646629810333252, + "step": 1165 + }, + { + "epoch": 0.7615315535962119, + "grad_norm": 61.748046100373145, + "learning_rate": 2.4536085758499845e-08, + "logits/chosen": -1.4410693645477295, + "logits/rejected": -1.4090842008590698, + "logps/chosen": -860.2272338867188, + "logps/rejected": -942.9466552734375, + "loss": 0.5124, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1852986812591553, + "rewards/margins": 1.0137269496917725, + "rewards/rejected": -4.199025630950928, + "step": 1166 + }, + { + "epoch": 0.7621846681361744, + "grad_norm": 111.18614790178633, + "learning_rate": 2.4409633444678828e-08, + "logits/chosen": -1.5321191549301147, + "logits/rejected": -1.5227891206741333, + "logps/chosen": -827.511474609375, + "logps/rejected": -978.70458984375, + "loss": 0.493, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.226242780685425, + "rewards/margins": 1.047170877456665, + "rewards/rejected": -4.27341365814209, + "step": 1167 + }, + { + "epoch": 0.7628377826761368, + "grad_norm": 39.173609596636474, + "learning_rate": 2.42834444603506e-08, + "logits/chosen": -1.4213345050811768, + "logits/rejected": -1.4618169069290161, + "logps/chosen": -748.9151000976562, + "logps/rejected": -865.19775390625, + "loss": 0.5424, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1750364303588867, + "rewards/margins": 0.8123236298561096, + "rewards/rejected": -3.9873600006103516, + "step": 1168 + }, + { + "epoch": 0.7634908972160993, + "grad_norm": 100.75678767672636, + "learning_rate": 2.4157519462345373e-08, + "logits/chosen": -1.494844675064087, + "logits/rejected": -1.3755334615707397, + "logps/chosen": -878.364990234375, + "logps/rejected": -937.23193359375, + "loss": 0.507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4742939472198486, + "rewards/margins": 0.7716997861862183, + "rewards/rejected": -4.245993614196777, + "step": 1169 + }, + { + "epoch": 0.7641440117560617, + "grad_norm": 18.303452723748897, + "learning_rate": 2.4031859106119267e-08, + "logits/chosen": -1.4939923286437988, + "logits/rejected": -1.4547199010849, + "logps/chosen": -811.0922241210938, + "logps/rejected": -918.572021484375, + "loss": 0.5138, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3449172973632812, + "rewards/margins": 1.024104356765747, + "rewards/rejected": -4.369021415710449, + "step": 1170 + }, + { + "epoch": 0.7647971262960241, + "grad_norm": 67.44858894155142, + "learning_rate": 2.3906464045750928e-08, + "logits/chosen": -1.5134905576705933, + "logits/rejected": -1.4933390617370605, + "logps/chosen": -844.356689453125, + "logps/rejected": -925.1680908203125, + "loss": 0.4776, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6389784812927246, + "rewards/margins": 0.7109842300415039, + "rewards/rejected": -4.34996223449707, + "step": 1171 + }, + { + "epoch": 0.7654502408359866, + "grad_norm": 60.64995050523796, + "learning_rate": 2.3781334933938094e-08, + "logits/chosen": -1.5297952890396118, + "logits/rejected": -1.520561695098877, + "logps/chosen": -983.57958984375, + "logps/rejected": -1131.6011962890625, + "loss": 0.4451, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.437025785446167, + "rewards/margins": 1.1520684957504272, + "rewards/rejected": -4.589094161987305, + "step": 1172 + }, + { + "epoch": 0.7661033553759491, + "grad_norm": 61.990214860015534, + "learning_rate": 2.3656472421994215e-08, + "logits/chosen": -1.447062373161316, + "logits/rejected": -1.447553038597107, + "logps/chosen": -779.4403076171875, + "logps/rejected": -957.9571533203125, + "loss": 0.5032, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1557390689849854, + "rewards/margins": 1.0444045066833496, + "rewards/rejected": -4.200143814086914, + "step": 1173 + }, + { + "epoch": 0.7667564699159115, + "grad_norm": 16.149576622820327, + "learning_rate": 2.3531877159844986e-08, + "logits/chosen": -1.5438363552093506, + "logits/rejected": -1.5356152057647705, + "logps/chosen": -883.9613647460938, + "logps/rejected": -928.40087890625, + "loss": 0.5555, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.240826368331909, + "rewards/margins": 0.7265506386756897, + "rewards/rejected": -3.967376947402954, + "step": 1174 + }, + { + "epoch": 0.7674095844558739, + "grad_norm": 12.855340027433474, + "learning_rate": 2.3407549796025138e-08, + "logits/chosen": -1.444447636604309, + "logits/rejected": -1.4474413394927979, + "logps/chosen": -902.9122924804688, + "logps/rejected": -1022.8636474609375, + "loss": 0.4878, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.586104393005371, + "rewards/margins": 1.1032967567443848, + "rewards/rejected": -4.689400672912598, + "step": 1175 + }, + { + "epoch": 0.7680626989958363, + "grad_norm": 44.85596625562831, + "learning_rate": 2.3283490977674887e-08, + "logits/chosen": -1.47477126121521, + "logits/rejected": -1.473871111869812, + "logps/chosen": -713.3693237304688, + "logps/rejected": -823.8683471679688, + "loss": 0.4626, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.159616708755493, + "rewards/margins": 0.8683343529701233, + "rewards/rejected": -4.027950763702393, + "step": 1176 + }, + { + "epoch": 0.7687158135357989, + "grad_norm": 10.37144025860678, + "learning_rate": 2.3159701350536645e-08, + "logits/chosen": -1.4041333198547363, + "logits/rejected": -1.3923746347427368, + "logps/chosen": -832.6809692382812, + "logps/rejected": -847.46923828125, + "loss": 0.4712, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0015857219696045, + "rewards/margins": 0.7252290844917297, + "rewards/rejected": -3.7268147468566895, + "step": 1177 + }, + { + "epoch": 0.7693689280757613, + "grad_norm": 14.75625387529424, + "learning_rate": 2.3036181558951672e-08, + "logits/chosen": -1.5356097221374512, + "logits/rejected": -1.5658423900604248, + "logps/chosen": -829.6064453125, + "logps/rejected": -899.4547729492188, + "loss": 0.4993, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.934075117111206, + "rewards/margins": 1.1105401515960693, + "rewards/rejected": -4.044615268707275, + "step": 1178 + }, + { + "epoch": 0.7700220426157237, + "grad_norm": 11.129241460176539, + "learning_rate": 2.2912932245856683e-08, + "logits/chosen": -1.5327390432357788, + "logits/rejected": -1.4865920543670654, + "logps/chosen": -914.9544067382812, + "logps/rejected": -1039.690185546875, + "loss": 0.431, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.7165744304656982, + "rewards/margins": 1.1967370510101318, + "rewards/rejected": -4.913311958312988, + "step": 1179 + }, + { + "epoch": 0.7706751571556861, + "grad_norm": 27.071377513200186, + "learning_rate": 2.278995405278051e-08, + "logits/chosen": -1.490029215812683, + "logits/rejected": -1.4955387115478516, + "logps/chosen": -788.3419799804688, + "logps/rejected": -908.0980224609375, + "loss": 0.4685, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.865569829940796, + "rewards/margins": 0.890224814414978, + "rewards/rejected": -3.7557942867279053, + "step": 1180 + }, + { + "epoch": 0.7713282716956487, + "grad_norm": 68.89217637019948, + "learning_rate": 2.266724761984077e-08, + "logits/chosen": -1.5287120342254639, + "logits/rejected": -1.5607788562774658, + "logps/chosen": -864.7396240234375, + "logps/rejected": -897.0245971679688, + "loss": 0.5442, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2756667137145996, + "rewards/margins": 0.8783707618713379, + "rewards/rejected": -4.1540374755859375, + "step": 1181 + }, + { + "epoch": 0.7719813862356111, + "grad_norm": 23.549000554913878, + "learning_rate": 2.2544813585740552e-08, + "logits/chosen": -1.5588055849075317, + "logits/rejected": -1.566838264465332, + "logps/chosen": -907.4659423828125, + "logps/rejected": -1011.3638916015625, + "loss": 0.4863, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.550849437713623, + "rewards/margins": 0.8633126616477966, + "rewards/rejected": -4.414162635803223, + "step": 1182 + }, + { + "epoch": 0.7726345007755735, + "grad_norm": 73.6862565187378, + "learning_rate": 2.242265258776505e-08, + "logits/chosen": -1.4739999771118164, + "logits/rejected": -1.4539254903793335, + "logps/chosen": -828.4271850585938, + "logps/rejected": -1103.9161376953125, + "loss": 0.4844, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.812699317932129, + "rewards/margins": 1.4099655151367188, + "rewards/rejected": -4.222664833068848, + "step": 1183 + }, + { + "epoch": 0.7732876153155359, + "grad_norm": 28.50045032866641, + "learning_rate": 2.2300765261778312e-08, + "logits/chosen": -1.4642502069473267, + "logits/rejected": -1.466383457183838, + "logps/chosen": -875.912841796875, + "logps/rejected": -906.390380859375, + "loss": 0.5413, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.574983596801758, + "rewards/margins": 0.3633178770542145, + "rewards/rejected": -3.9383015632629395, + "step": 1184 + }, + { + "epoch": 0.7739407298554984, + "grad_norm": 13.495729369988036, + "learning_rate": 2.2179152242219803e-08, + "logits/chosen": -1.5543307065963745, + "logits/rejected": -1.5425894260406494, + "logps/chosen": -870.7875366210938, + "logps/rejected": -942.445556640625, + "loss": 0.489, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3177812099456787, + "rewards/margins": 0.8684400320053101, + "rewards/rejected": -4.186221599578857, + "step": 1185 + }, + { + "epoch": 0.7745938443954609, + "grad_norm": 121.88098996369544, + "learning_rate": 2.205781416210126e-08, + "logits/chosen": -1.3570061922073364, + "logits/rejected": -1.3827579021453857, + "logps/chosen": -873.6181030273438, + "logps/rejected": -917.5654296875, + "loss": 0.4861, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.150343894958496, + "rewards/margins": 1.0735583305358887, + "rewards/rejected": -4.223902702331543, + "step": 1186 + }, + { + "epoch": 0.7752469589354233, + "grad_norm": 42.77475056245068, + "learning_rate": 2.1936751653003312e-08, + "logits/chosen": -1.525205135345459, + "logits/rejected": -1.5034763813018799, + "logps/chosen": -855.9130249023438, + "logps/rejected": -936.5135498046875, + "loss": 0.4463, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.344566822052002, + "rewards/margins": 0.8852987289428711, + "rewards/rejected": -4.229865550994873, + "step": 1187 + }, + { + "epoch": 0.7759000734753857, + "grad_norm": 22.279778341803244, + "learning_rate": 2.181596534507219e-08, + "logits/chosen": -1.5698206424713135, + "logits/rejected": -1.5208396911621094, + "logps/chosen": -858.2536010742188, + "logps/rejected": -927.1165771484375, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2629714012145996, + "rewards/margins": 0.9225921630859375, + "rewards/rejected": -4.185564041137695, + "step": 1188 + }, + { + "epoch": 0.7765531880153482, + "grad_norm": 79.29971170882058, + "learning_rate": 2.169545586701647e-08, + "logits/chosen": -1.6312425136566162, + "logits/rejected": -1.5835908651351929, + "logps/chosen": -894.0716552734375, + "logps/rejected": -900.3629760742188, + "loss": 0.4934, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.885010242462158, + "rewards/margins": 0.54335618019104, + "rewards/rejected": -4.428366661071777, + "step": 1189 + }, + { + "epoch": 0.7772063025553106, + "grad_norm": 50.17561559841045, + "learning_rate": 2.157522384610379e-08, + "logits/chosen": -1.4537131786346436, + "logits/rejected": -1.4958133697509766, + "logps/chosen": -851.051513671875, + "logps/rejected": -874.2980346679688, + "loss": 0.5582, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4431140422821045, + "rewards/margins": 0.47253862023353577, + "rewards/rejected": -3.9156525135040283, + "step": 1190 + }, + { + "epoch": 0.7778594170952731, + "grad_norm": 16.156948341176815, + "learning_rate": 2.1455269908157583e-08, + "logits/chosen": -1.574291467666626, + "logits/rejected": -1.532688856124878, + "logps/chosen": -930.2337646484375, + "logps/rejected": -1081.06103515625, + "loss": 0.4525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6473445892333984, + "rewards/margins": 1.3627281188964844, + "rewards/rejected": -5.010072708129883, + "step": 1191 + }, + { + "epoch": 0.7785125316352355, + "grad_norm": 18.980973183816936, + "learning_rate": 2.133559467755383e-08, + "logits/chosen": -1.6023908853530884, + "logits/rejected": -1.603822946548462, + "logps/chosen": -828.7919311523438, + "logps/rejected": -913.5147705078125, + "loss": 0.4728, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.939816951751709, + "rewards/margins": 0.9978110790252686, + "rewards/rejected": -3.9376275539398193, + "step": 1192 + }, + { + "epoch": 0.779165646175198, + "grad_norm": 22.58657566222872, + "learning_rate": 2.12161987772178e-08, + "logits/chosen": -1.5595180988311768, + "logits/rejected": -1.5140559673309326, + "logps/chosen": -898.070068359375, + "logps/rejected": -931.6812744140625, + "loss": 0.5008, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8756892681121826, + "rewards/margins": 0.6614542603492737, + "rewards/rejected": -4.537143230438232, + "step": 1193 + }, + { + "epoch": 0.7798187607151604, + "grad_norm": 14.841539963082138, + "learning_rate": 2.1097082828620823e-08, + "logits/chosen": -1.5292487144470215, + "logits/rejected": -1.4865946769714355, + "logps/chosen": -779.00244140625, + "logps/rejected": -803.3477172851562, + "loss": 0.5534, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.951836347579956, + "rewards/margins": 0.46454131603240967, + "rewards/rejected": -3.4163777828216553, + "step": 1194 + }, + { + "epoch": 0.7804718752551228, + "grad_norm": 70.51401475102385, + "learning_rate": 2.0978247451777027e-08, + "logits/chosen": -1.4845449924468994, + "logits/rejected": -1.4704723358154297, + "logps/chosen": -838.1801147460938, + "logps/rejected": -939.9219970703125, + "loss": 0.4543, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.091367244720459, + "rewards/margins": 0.8508425354957581, + "rewards/rejected": -3.9422097206115723, + "step": 1195 + }, + { + "epoch": 0.7811249897950853, + "grad_norm": 39.49476961487304, + "learning_rate": 2.0859693265240133e-08, + "logits/chosen": -1.4192631244659424, + "logits/rejected": -1.4639571905136108, + "logps/chosen": -819.2432861328125, + "logps/rejected": -913.455078125, + "loss": 0.4721, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.472623109817505, + "rewards/margins": 0.8139183521270752, + "rewards/rejected": -4.28654146194458, + "step": 1196 + }, + { + "epoch": 0.7817781043350478, + "grad_norm": 38.127552444665085, + "learning_rate": 2.0741420886100226e-08, + "logits/chosen": -1.4315974712371826, + "logits/rejected": -1.3939564228057861, + "logps/chosen": -748.397705078125, + "logps/rejected": -789.2747802734375, + "loss": 0.609, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.52626895904541, + "rewards/margins": 0.5112132430076599, + "rewards/rejected": -4.037482261657715, + "step": 1197 + }, + { + "epoch": 0.7824312188750102, + "grad_norm": 92.0980308502603, + "learning_rate": 2.0623430929980555e-08, + "logits/chosen": -1.485008955001831, + "logits/rejected": -1.4899412393569946, + "logps/chosen": -1039.5450439453125, + "logps/rejected": -1029.2069091796875, + "loss": 0.5313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7229716777801514, + "rewards/margins": 0.5886335372924805, + "rewards/rejected": -4.311605453491211, + "step": 1198 + }, + { + "epoch": 0.7830843334149726, + "grad_norm": 23.40754516414652, + "learning_rate": 2.0505724011034305e-08, + "logits/chosen": -1.5619934797286987, + "logits/rejected": -1.5442955493927002, + "logps/chosen": -920.8180541992188, + "logps/rejected": -1025.9613037109375, + "loss": 0.4786, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7618021965026855, + "rewards/margins": 0.9860894680023193, + "rewards/rejected": -4.747891426086426, + "step": 1199 + }, + { + "epoch": 0.783737447954935, + "grad_norm": 71.55959913562558, + "learning_rate": 2.0388300741941447e-08, + "logits/chosen": -1.4612364768981934, + "logits/rejected": -1.426287293434143, + "logps/chosen": -804.0591430664062, + "logps/rejected": -901.520751953125, + "loss": 0.5403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.961287260055542, + "rewards/margins": 0.8116233944892883, + "rewards/rejected": -3.7729105949401855, + "step": 1200 + }, + { + "epoch": 0.783737447954935, + "eval_logits/chosen": -1.4822365045547485, + "eval_logits/rejected": -1.464072823524475, + "eval_logps/chosen": -844.524169921875, + "eval_logps/rejected": -927.9747314453125, + "eval_loss": 0.49636971950531006, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -3.3417880535125732, + "eval_rewards/margins": 0.9155648350715637, + "eval_rewards/rejected": -4.257352828979492, + "eval_runtime": 296.4203, + "eval_samples_per_second": 13.494, + "eval_steps_per_second": 0.843, + "step": 1200 + }, + { + "epoch": 0.7843905624948976, + "grad_norm": 38.14391865815223, + "learning_rate": 2.027116173390549e-08, + "logits/chosen": -1.4551496505737305, + "logits/rejected": -1.446352243423462, + "logps/chosen": -821.078369140625, + "logps/rejected": -918.0855712890625, + "loss": 0.4739, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5896596908569336, + "rewards/margins": 0.7162079215049744, + "rewards/rejected": -4.305867671966553, + "step": 1201 + }, + { + "epoch": 0.78504367703486, + "grad_norm": 59.161675430462715, + "learning_rate": 2.015430759665032e-08, + "logits/chosen": -1.5572847127914429, + "logits/rejected": -1.5632448196411133, + "logps/chosen": -812.02294921875, + "logps/rejected": -977.6295776367188, + "loss": 0.4483, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5253820419311523, + "rewards/margins": 1.2742270231246948, + "rewards/rejected": -4.799609184265137, + "step": 1202 + }, + { + "epoch": 0.7856967915748224, + "grad_norm": 96.67396687187629, + "learning_rate": 2.003773893841706e-08, + "logits/chosen": -1.5836585760116577, + "logits/rejected": -1.5667706727981567, + "logps/chosen": -812.0299682617188, + "logps/rejected": -925.529052734375, + "loss": 0.4787, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8535349369049072, + "rewards/margins": 1.1299189329147339, + "rewards/rejected": -3.9834539890289307, + "step": 1203 + }, + { + "epoch": 0.7863499061147848, + "grad_norm": 10.591507916072793, + "learning_rate": 1.9921456365960856e-08, + "logits/chosen": -1.4587793350219727, + "logits/rejected": -1.4299750328063965, + "logps/chosen": -911.44384765625, + "logps/rejected": -1020.269287109375, + "loss": 0.5223, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7610325813293457, + "rewards/margins": 0.6058591604232788, + "rewards/rejected": -4.366891860961914, + "step": 1204 + }, + { + "epoch": 0.7870030206547474, + "grad_norm": 92.25556529129581, + "learning_rate": 1.980546048454776e-08, + "logits/chosen": -1.4399176836013794, + "logits/rejected": -1.4096533060073853, + "logps/chosen": -771.2178955078125, + "logps/rejected": -856.9338989257812, + "loss": 0.5283, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.911550521850586, + "rewards/margins": 0.6725373268127441, + "rewards/rejected": -3.584087610244751, + "step": 1205 + }, + { + "epoch": 0.7876561351947098, + "grad_norm": 50.596463509645794, + "learning_rate": 1.9689751897951532e-08, + "logits/chosen": -1.4841548204421997, + "logits/rejected": -1.4760531187057495, + "logps/chosen": -828.1544799804688, + "logps/rejected": -881.6762084960938, + "loss": 0.5132, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.460951328277588, + "rewards/margins": 0.6589277982711792, + "rewards/rejected": -4.119879245758057, + "step": 1206 + }, + { + "epoch": 0.7883092497346722, + "grad_norm": 16.96643780611066, + "learning_rate": 1.9574331208450577e-08, + "logits/chosen": -1.4726507663726807, + "logits/rejected": -1.5015829801559448, + "logps/chosen": -860.7503051757812, + "logps/rejected": -979.796142578125, + "loss": 0.4893, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2518157958984375, + "rewards/margins": 0.9525402784347534, + "rewards/rejected": -4.2043561935424805, + "step": 1207 + }, + { + "epoch": 0.7889623642746346, + "grad_norm": 17.37151480950942, + "learning_rate": 1.9459199016824668e-08, + "logits/chosen": -1.4686522483825684, + "logits/rejected": -1.4412596225738525, + "logps/chosen": -841.184326171875, + "logps/rejected": -1021.4664306640625, + "loss": 0.4921, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.811629295349121, + "rewards/margins": 1.1526672840118408, + "rewards/rejected": -4.964296340942383, + "step": 1208 + }, + { + "epoch": 0.7896154788145971, + "grad_norm": 15.472923440337157, + "learning_rate": 1.9344355922351986e-08, + "logits/chosen": -1.4205299615859985, + "logits/rejected": -1.4428074359893799, + "logps/chosen": -806.4320678710938, + "logps/rejected": -891.0250854492188, + "loss": 0.4432, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.923144578933716, + "rewards/margins": 1.4412071704864502, + "rewards/rejected": -4.364351272583008, + "step": 1209 + }, + { + "epoch": 0.7902685933545596, + "grad_norm": 67.06746939382018, + "learning_rate": 1.922980252280589e-08, + "logits/chosen": -1.4646856784820557, + "logits/rejected": -1.4149240255355835, + "logps/chosen": -835.4559936523438, + "logps/rejected": -962.0414428710938, + "loss": 0.4964, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.448225498199463, + "rewards/margins": 0.8466680645942688, + "rewards/rejected": -4.294894218444824, + "step": 1210 + }, + { + "epoch": 0.790921707894522, + "grad_norm": 11.275402565937414, + "learning_rate": 1.9115539414451864e-08, + "logits/chosen": -1.4719829559326172, + "logits/rejected": -1.5199016332626343, + "logps/chosen": -804.529541015625, + "logps/rejected": -926.13623046875, + "loss": 0.4676, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2839932441711426, + "rewards/margins": 1.0498989820480347, + "rewards/rejected": -4.333892345428467, + "step": 1211 + }, + { + "epoch": 0.7915748224344844, + "grad_norm": 27.08202842365883, + "learning_rate": 1.9001567192044367e-08, + "logits/chosen": -1.4914042949676514, + "logits/rejected": -1.438066840171814, + "logps/chosen": -764.7699584960938, + "logps/rejected": -883.679443359375, + "loss": 0.4568, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.755364418029785, + "rewards/margins": 0.9105137586593628, + "rewards/rejected": -3.6658782958984375, + "step": 1212 + }, + { + "epoch": 0.7922279369744469, + "grad_norm": 19.72267692530407, + "learning_rate": 1.888788644882376e-08, + "logits/chosen": -1.5127348899841309, + "logits/rejected": -1.491123080253601, + "logps/chosen": -755.7037963867188, + "logps/rejected": -753.5874633789062, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.000194549560547, + "rewards/margins": 0.5762927532196045, + "rewards/rejected": -3.5764873027801514, + "step": 1213 + }, + { + "epoch": 0.7928810515144094, + "grad_norm": 87.91830418107274, + "learning_rate": 1.8774497776513222e-08, + "logits/chosen": -1.4988164901733398, + "logits/rejected": -1.5042667388916016, + "logps/chosen": -846.1057739257812, + "logps/rejected": -972.7584228515625, + "loss": 0.4886, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.0713298320770264, + "rewards/margins": 0.8871229290962219, + "rewards/rejected": -3.9584529399871826, + "step": 1214 + }, + { + "epoch": 0.7935341660543718, + "grad_norm": 40.368754316154686, + "learning_rate": 1.8661401765315665e-08, + "logits/chosen": -1.430102825164795, + "logits/rejected": -1.4247267246246338, + "logps/chosen": -770.588623046875, + "logps/rejected": -842.4027099609375, + "loss": 0.4727, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.137661933898926, + "rewards/margins": 1.112958550453186, + "rewards/rejected": -4.2506208419799805, + "step": 1215 + }, + { + "epoch": 0.7941872805943342, + "grad_norm": 80.49162691211416, + "learning_rate": 1.8548599003910666e-08, + "logits/chosen": -1.4262512922286987, + "logits/rejected": -1.4448204040527344, + "logps/chosen": -881.8290405273438, + "logps/rejected": -979.1732788085938, + "loss": 0.4524, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9086101055145264, + "rewards/margins": 1.3396742343902588, + "rewards/rejected": -5.248284339904785, + "step": 1216 + }, + { + "epoch": 0.7948403951342967, + "grad_norm": 16.198495509148298, + "learning_rate": 1.843609007945138e-08, + "logits/chosen": -1.4633352756500244, + "logits/rejected": -1.462742805480957, + "logps/chosen": -818.7781372070312, + "logps/rejected": -850.6954956054688, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.175035238265991, + "rewards/margins": 0.5016341805458069, + "rewards/rejected": -3.676669120788574, + "step": 1217 + }, + { + "epoch": 0.7954935096742591, + "grad_norm": 30.16649596643903, + "learning_rate": 1.832387557756151e-08, + "logits/chosen": -1.4935240745544434, + "logits/rejected": -1.4943130016326904, + "logps/chosen": -749.455078125, + "logps/rejected": -783.4890747070312, + "loss": 0.5097, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.018079996109009, + "rewards/margins": 0.39417386054992676, + "rewards/rejected": -3.4122540950775146, + "step": 1218 + }, + { + "epoch": 0.7961466242142216, + "grad_norm": 63.990103383436264, + "learning_rate": 1.82119560823323e-08, + "logits/chosen": -1.5611546039581299, + "logits/rejected": -1.5044180154800415, + "logps/chosen": -828.6058349609375, + "logps/rejected": -971.0333862304688, + "loss": 0.5007, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2003324031829834, + "rewards/margins": 1.3617459535598755, + "rewards/rejected": -4.56207799911499, + "step": 1219 + }, + { + "epoch": 0.796799738754184, + "grad_norm": 54.853667872883626, + "learning_rate": 1.8100332176319338e-08, + "logits/chosen": -1.4820096492767334, + "logits/rejected": -1.5054271221160889, + "logps/chosen": -863.5888061523438, + "logps/rejected": -936.4130249023438, + "loss": 0.5115, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.6043381690979004, + "rewards/margins": 0.8906245827674866, + "rewards/rejected": -4.494962692260742, + "step": 1220 + }, + { + "epoch": 0.7974528532941465, + "grad_norm": 14.995625717970094, + "learning_rate": 1.798900444053972e-08, + "logits/chosen": -1.5272332429885864, + "logits/rejected": -1.516082763671875, + "logps/chosen": -958.9482421875, + "logps/rejected": -1058.3154296875, + "loss": 0.4977, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6495161056518555, + "rewards/margins": 0.7711564898490906, + "rewards/rejected": -4.420672416687012, + "step": 1221 + }, + { + "epoch": 0.7981059678341089, + "grad_norm": 17.49568930582209, + "learning_rate": 1.7877973454468918e-08, + "logits/chosen": -1.5633349418640137, + "logits/rejected": -1.5596158504486084, + "logps/chosen": -835.0476684570312, + "logps/rejected": -886.090087890625, + "loss": 0.4917, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4766788482666016, + "rewards/margins": 0.8626784682273865, + "rewards/rejected": -4.339357376098633, + "step": 1222 + }, + { + "epoch": 0.7987590823740713, + "grad_norm": 52.96090556956152, + "learning_rate": 1.7767239796037765e-08, + "logits/chosen": -1.4490398168563843, + "logits/rejected": -1.443628191947937, + "logps/chosen": -802.57373046875, + "logps/rejected": -889.3080444335938, + "loss": 0.4679, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.239607334136963, + "rewards/margins": 1.018710970878601, + "rewards/rejected": -4.258317947387695, + "step": 1223 + }, + { + "epoch": 0.7994121969140338, + "grad_norm": 150.8059871681939, + "learning_rate": 1.7656804041629487e-08, + "logits/chosen": -1.5590651035308838, + "logits/rejected": -1.5880131721496582, + "logps/chosen": -928.2868041992188, + "logps/rejected": -929.5325317382812, + "loss": 0.527, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.8474342823028564, + "rewards/margins": 0.5773695111274719, + "rewards/rejected": -4.424803733825684, + "step": 1224 + }, + { + "epoch": 0.8000653114539963, + "grad_norm": 35.966169524301534, + "learning_rate": 1.7546666766076656e-08, + "logits/chosen": -1.449623465538025, + "logits/rejected": -1.5213428735733032, + "logps/chosen": -757.2681884765625, + "logps/rejected": -924.322509765625, + "loss": 0.4752, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0022165775299072, + "rewards/margins": 0.9503800868988037, + "rewards/rejected": -3.952596664428711, + "step": 1225 + }, + { + "epoch": 0.8007184259939587, + "grad_norm": 56.42831654190206, + "learning_rate": 1.743682854265825e-08, + "logits/chosen": -1.4219781160354614, + "logits/rejected": -1.466795802116394, + "logps/chosen": -840.7289428710938, + "logps/rejected": -965.5621337890625, + "loss": 0.4994, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4919745922088623, + "rewards/margins": 0.8413702249526978, + "rewards/rejected": -4.333344459533691, + "step": 1226 + }, + { + "epoch": 0.8013715405339211, + "grad_norm": 73.37249453736422, + "learning_rate": 1.732728994309661e-08, + "logits/chosen": -1.5576728582382202, + "logits/rejected": -1.5214743614196777, + "logps/chosen": -763.7567749023438, + "logps/rejected": -810.2583618164062, + "loss": 0.4673, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8903627395629883, + "rewards/margins": 0.7463845610618591, + "rewards/rejected": -3.636747360229492, + "step": 1227 + }, + { + "epoch": 0.8020246550738835, + "grad_norm": 61.87692158071728, + "learning_rate": 1.7218051537554536e-08, + "logits/chosen": -1.5665578842163086, + "logits/rejected": -1.549164891242981, + "logps/chosen": -867.8675537109375, + "logps/rejected": -983.275634765625, + "loss": 0.4786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3755083084106445, + "rewards/margins": 0.9794182777404785, + "rewards/rejected": -4.354927062988281, + "step": 1228 + }, + { + "epoch": 0.8026777696138461, + "grad_norm": 45.704835797476456, + "learning_rate": 1.7109113894632233e-08, + "logits/chosen": -1.4501968622207642, + "logits/rejected": -1.470442771911621, + "logps/chosen": -766.7777099609375, + "logps/rejected": -880.814453125, + "loss": 0.5157, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.088731288909912, + "rewards/margins": 0.6824835538864136, + "rewards/rejected": -3.7712154388427734, + "step": 1229 + }, + { + "epoch": 0.8033308841538085, + "grad_norm": 12.041002229288496, + "learning_rate": 1.700047758136443e-08, + "logits/chosen": -1.4775810241699219, + "logits/rejected": -1.4621860980987549, + "logps/chosen": -792.953857421875, + "logps/rejected": -907.03759765625, + "loss": 0.4903, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9522554874420166, + "rewards/margins": 1.064509391784668, + "rewards/rejected": -4.016765117645264, + "step": 1230 + }, + { + "epoch": 0.8039839986937709, + "grad_norm": 63.50098008198583, + "learning_rate": 1.689214316321739e-08, + "logits/chosen": -1.6399027109146118, + "logits/rejected": -1.5657185316085815, + "logps/chosen": -886.7124633789062, + "logps/rejected": -979.0396728515625, + "loss": 0.5165, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.5052478313446045, + "rewards/margins": 1.1115096807479858, + "rewards/rejected": -4.616757869720459, + "step": 1231 + }, + { + "epoch": 0.8046371132337333, + "grad_norm": 34.66004736818009, + "learning_rate": 1.678411120408595e-08, + "logits/chosen": -1.4924274682998657, + "logits/rejected": -1.4871619939804077, + "logps/chosen": -876.4576416015625, + "logps/rejected": -956.4109497070312, + "loss": 0.4692, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.6071510314941406, + "rewards/margins": 1.06039297580719, + "rewards/rejected": -4.667544364929199, + "step": 1232 + }, + { + "epoch": 0.8052902277736959, + "grad_norm": 43.61205378322202, + "learning_rate": 1.6676382266290647e-08, + "logits/chosen": -1.4874005317687988, + "logits/rejected": -1.5195080041885376, + "logps/chosen": -837.166015625, + "logps/rejected": -892.5218505859375, + "loss": 0.5412, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4187517166137695, + "rewards/margins": 0.5736494660377502, + "rewards/rejected": -3.992400884628296, + "step": 1233 + }, + { + "epoch": 0.8059433423136583, + "grad_norm": 98.81263517998161, + "learning_rate": 1.6568956910574712e-08, + "logits/chosen": -1.530608892440796, + "logits/rejected": -1.5279160737991333, + "logps/chosen": -837.9874877929688, + "logps/rejected": -845.3988647460938, + "loss": 0.5812, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.741583824157715, + "rewards/margins": 0.040613558143377304, + "rewards/rejected": -3.7821972370147705, + "step": 1234 + }, + { + "epoch": 0.8065964568536207, + "grad_norm": 78.86885176654913, + "learning_rate": 1.6461835696101227e-08, + "logits/chosen": -1.4850387573242188, + "logits/rejected": -1.4923676252365112, + "logps/chosen": -871.99658203125, + "logps/rejected": -949.8641357421875, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7873427867889404, + "rewards/margins": 0.7236688733100891, + "rewards/rejected": -4.511011600494385, + "step": 1235 + }, + { + "epoch": 0.8072495713935831, + "grad_norm": 35.00941246017448, + "learning_rate": 1.6355019180450148e-08, + "logits/chosen": -1.555824637413025, + "logits/rejected": -1.4870938062667847, + "logps/chosen": -837.19140625, + "logps/rejected": -921.0797729492188, + "loss": 0.4621, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3375821113586426, + "rewards/margins": 0.9209874272346497, + "rewards/rejected": -4.258569717407227, + "step": 1236 + }, + { + "epoch": 0.8079026859335456, + "grad_norm": 90.40912979516071, + "learning_rate": 1.6248507919615452e-08, + "logits/chosen": -1.4929189682006836, + "logits/rejected": -1.5122658014297485, + "logps/chosen": -916.9654541015625, + "logps/rejected": -1060.645751953125, + "loss": 0.42, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.5597081184387207, + "rewards/margins": 1.2362518310546875, + "rewards/rejected": -4.795959949493408, + "step": 1237 + }, + { + "epoch": 0.8085558004735081, + "grad_norm": 14.457346050782789, + "learning_rate": 1.6142302468002227e-08, + "logits/chosen": -1.4068070650100708, + "logits/rejected": -1.399592638015747, + "logps/chosen": -771.3148193359375, + "logps/rejected": -843.7901611328125, + "loss": 0.5132, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3284435272216797, + "rewards/margins": 0.8365193605422974, + "rewards/rejected": -4.164963245391846, + "step": 1238 + }, + { + "epoch": 0.8092089150134705, + "grad_norm": 37.50589751809788, + "learning_rate": 1.603640337842377e-08, + "logits/chosen": -1.5218846797943115, + "logits/rejected": -1.5328527688980103, + "logps/chosen": -876.4766235351562, + "logps/rejected": -1026.4345703125, + "loss": 0.4891, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1716041564941406, + "rewards/margins": 0.8838239312171936, + "rewards/rejected": -4.0554280281066895, + "step": 1239 + }, + { + "epoch": 0.8098620295534329, + "grad_norm": 39.42416181441758, + "learning_rate": 1.5930811202098737e-08, + "logits/chosen": -1.451581597328186, + "logits/rejected": -1.4357130527496338, + "logps/chosen": -736.4873046875, + "logps/rejected": -854.6323852539062, + "loss": 0.5309, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.231121778488159, + "rewards/margins": 0.7173824310302734, + "rewards/rejected": -3.9485039710998535, + "step": 1240 + }, + { + "epoch": 0.8105151440933954, + "grad_norm": 15.457594936787636, + "learning_rate": 1.5825526488648268e-08, + "logits/chosen": -1.5039798021316528, + "logits/rejected": -1.4966380596160889, + "logps/chosen": -838.3843994140625, + "logps/rejected": -906.8178100585938, + "loss": 0.5552, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.452669620513916, + "rewards/margins": 0.6774091124534607, + "rewards/rejected": -4.1300787925720215, + "step": 1241 + }, + { + "epoch": 0.8111682586333578, + "grad_norm": 30.649527114852525, + "learning_rate": 1.572054978609306e-08, + "logits/chosen": -1.4918582439422607, + "logits/rejected": -1.454628586769104, + "logps/chosen": -912.1357421875, + "logps/rejected": -949.4915161132812, + "loss": 0.5541, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9472150802612305, + "rewards/margins": 0.8809834122657776, + "rewards/rejected": -4.828197956085205, + "step": 1242 + }, + { + "epoch": 0.8118213731733203, + "grad_norm": 15.819803195866424, + "learning_rate": 1.5615881640850653e-08, + "logits/chosen": -1.4457621574401855, + "logits/rejected": -1.3891899585723877, + "logps/chosen": -806.92626953125, + "logps/rejected": -855.1336669921875, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.198312282562256, + "rewards/margins": 1.005974531173706, + "rewards/rejected": -4.204287052154541, + "step": 1243 + }, + { + "epoch": 0.8124744877132827, + "grad_norm": 20.662834289028424, + "learning_rate": 1.551152259773245e-08, + "logits/chosen": -1.4745066165924072, + "logits/rejected": -1.4477360248565674, + "logps/chosen": -888.340576171875, + "logps/rejected": -906.5769653320312, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.483821392059326, + "rewards/margins": 0.7100880146026611, + "rewards/rejected": -4.193909168243408, + "step": 1244 + }, + { + "epoch": 0.8131276022532452, + "grad_norm": 40.934726173262696, + "learning_rate": 1.5407473199940978e-08, + "logits/chosen": -1.5310475826263428, + "logits/rejected": -1.5288856029510498, + "logps/chosen": -761.62451171875, + "logps/rejected": -932.880126953125, + "loss": 0.4466, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.145474433898926, + "rewards/margins": 1.0549681186676025, + "rewards/rejected": -4.200442790985107, + "step": 1245 + }, + { + "epoch": 0.8137807167932076, + "grad_norm": 59.614420768209214, + "learning_rate": 1.5303733989066992e-08, + "logits/chosen": -1.5001609325408936, + "logits/rejected": -1.4439678192138672, + "logps/chosen": -862.7713623046875, + "logps/rejected": -916.8041381835938, + "loss": 0.5735, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4643964767456055, + "rewards/margins": 0.6968106627464294, + "rewards/rejected": -4.16120719909668, + "step": 1246 + }, + { + "epoch": 0.81443383133317, + "grad_norm": 99.7769624540917, + "learning_rate": 1.5200305505086678e-08, + "logits/chosen": -1.451883316040039, + "logits/rejected": -1.4290595054626465, + "logps/chosen": -822.1856689453125, + "logps/rejected": -901.4767456054688, + "loss": 0.547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.326678514480591, + "rewards/margins": 0.7827971577644348, + "rewards/rejected": -4.109475612640381, + "step": 1247 + }, + { + "epoch": 0.8150869458731325, + "grad_norm": 35.074610634798226, + "learning_rate": 1.509718828635887e-08, + "logits/chosen": -1.4157919883728027, + "logits/rejected": -1.437688946723938, + "logps/chosen": -747.04638671875, + "logps/rejected": -875.550537109375, + "loss": 0.4635, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.111088991165161, + "rewards/margins": 0.8460332155227661, + "rewards/rejected": -3.9571218490600586, + "step": 1248 + }, + { + "epoch": 0.815740060413095, + "grad_norm": 31.080617936177795, + "learning_rate": 1.4994382869622212e-08, + "logits/chosen": -1.5038199424743652, + "logits/rejected": -1.5081907510757446, + "logps/chosen": -842.7953491210938, + "logps/rejected": -893.8030395507812, + "loss": 0.5325, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.6291537284851074, + "rewards/margins": 0.5055683851242065, + "rewards/rejected": -4.134722709655762, + "step": 1249 + }, + { + "epoch": 0.8163931749530574, + "grad_norm": 72.9364075543982, + "learning_rate": 1.4891889789992385e-08, + "logits/chosen": -1.4778282642364502, + "logits/rejected": -1.4801677465438843, + "logps/chosen": -810.5230712890625, + "logps/rejected": -921.8294067382812, + "loss": 0.504, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.3633217811584473, + "rewards/margins": 1.1871894598007202, + "rewards/rejected": -4.550510883331299, + "step": 1250 + }, + { + "epoch": 0.8170462894930198, + "grad_norm": 25.634012106471616, + "learning_rate": 1.4789709580959304e-08, + "logits/chosen": -1.4490480422973633, + "logits/rejected": -1.4597145318984985, + "logps/chosen": -787.5851440429688, + "logps/rejected": -899.1139526367188, + "loss": 0.5296, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.343380928039551, + "rewards/margins": 0.7966138124465942, + "rewards/rejected": -4.139995098114014, + "step": 1251 + }, + { + "epoch": 0.8176994040329822, + "grad_norm": 13.077479982264203, + "learning_rate": 1.4687842774384366e-08, + "logits/chosen": -1.4068443775177002, + "logits/rejected": -1.4268113374710083, + "logps/chosen": -732.1669921875, + "logps/rejected": -751.791259765625, + "loss": 0.5786, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2558491230010986, + "rewards/margins": 0.2685055732727051, + "rewards/rejected": -3.5243544578552246, + "step": 1252 + }, + { + "epoch": 0.8183525185729448, + "grad_norm": 96.37113725713898, + "learning_rate": 1.4586289900497672e-08, + "logits/chosen": -1.5462439060211182, + "logits/rejected": -1.519241452217102, + "logps/chosen": -875.8187866210938, + "logps/rejected": -946.7884521484375, + "loss": 0.5182, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.811178207397461, + "rewards/margins": 1.0562658309936523, + "rewards/rejected": -4.867443561553955, + "step": 1253 + }, + { + "epoch": 0.8190056331129072, + "grad_norm": 47.14222768313448, + "learning_rate": 1.4485051487895208e-08, + "logits/chosen": -1.6328545808792114, + "logits/rejected": -1.5723670721054077, + "logps/chosen": -878.3783569335938, + "logps/rejected": -1000.5723876953125, + "loss": 0.4553, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.133354902267456, + "rewards/margins": 1.4496155977249146, + "rewards/rejected": -4.58297061920166, + "step": 1254 + }, + { + "epoch": 0.8196587476528696, + "grad_norm": 77.13494040128981, + "learning_rate": 1.4384128063536215e-08, + "logits/chosen": -1.5092742443084717, + "logits/rejected": -1.4484039545059204, + "logps/chosen": -795.540283203125, + "logps/rejected": -922.9327392578125, + "loss": 0.5052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0639872550964355, + "rewards/margins": 0.9547647833824158, + "rewards/rejected": -4.018751621246338, + "step": 1255 + }, + { + "epoch": 0.820311862192832, + "grad_norm": 27.794467326470485, + "learning_rate": 1.4283520152740358e-08, + "logits/chosen": -1.527742624282837, + "logits/rejected": -1.550565481185913, + "logps/chosen": -901.008056640625, + "logps/rejected": -977.6214599609375, + "loss": 0.443, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6517744064331055, + "rewards/margins": 0.9718442559242249, + "rewards/rejected": -4.6236186027526855, + "step": 1256 + }, + { + "epoch": 0.8209649767327946, + "grad_norm": 48.23537245171485, + "learning_rate": 1.4183228279184986e-08, + "logits/chosen": -1.4986786842346191, + "logits/rejected": -1.505009651184082, + "logps/chosen": -780.6826171875, + "logps/rejected": -851.0025024414062, + "loss": 0.4116, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2454466819763184, + "rewards/margins": 0.8366904258728027, + "rewards/rejected": -4.082137107849121, + "step": 1257 + }, + { + "epoch": 0.821618091272757, + "grad_norm": 25.124612077309788, + "learning_rate": 1.4083252964902476e-08, + "logits/chosen": -1.4318965673446655, + "logits/rejected": -1.4011112451553345, + "logps/chosen": -767.1181640625, + "logps/rejected": -834.265380859375, + "loss": 0.4421, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2802295684814453, + "rewards/margins": 0.9027811884880066, + "rewards/rejected": -4.183011054992676, + "step": 1258 + }, + { + "epoch": 0.8222712058127194, + "grad_norm": 28.873103251905075, + "learning_rate": 1.3983594730277437e-08, + "logits/chosen": -1.504234790802002, + "logits/rejected": -1.4638817310333252, + "logps/chosen": -840.501220703125, + "logps/rejected": -915.8950805664062, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.346611976623535, + "rewards/margins": 1.1296738386154175, + "rewards/rejected": -4.4762864112854, + "step": 1259 + }, + { + "epoch": 0.8229243203526818, + "grad_norm": 92.65936407387743, + "learning_rate": 1.388425409404406e-08, + "logits/chosen": -1.3982869386672974, + "logits/rejected": -1.4056661128997803, + "logps/chosen": -882.9879150390625, + "logps/rejected": -926.9945068359375, + "loss": 0.5427, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.3799350261688232, + "rewards/margins": 0.531534731388092, + "rewards/rejected": -3.9114699363708496, + "step": 1260 + }, + { + "epoch": 0.8235774348926443, + "grad_norm": 21.267825506734145, + "learning_rate": 1.378523157328338e-08, + "logits/chosen": -1.463804006576538, + "logits/rejected": -1.4482476711273193, + "logps/chosen": -843.3424682617188, + "logps/rejected": -922.429931640625, + "loss": 0.4699, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.516786813735962, + "rewards/margins": 0.4834230840206146, + "rewards/rejected": -4.000209808349609, + "step": 1261 + }, + { + "epoch": 0.8242305494326068, + "grad_norm": 44.39704770318735, + "learning_rate": 1.3686527683420598e-08, + "logits/chosen": -1.5227603912353516, + "logits/rejected": -1.4986686706542969, + "logps/chosen": -880.2915649414062, + "logps/rejected": -956.4801025390625, + "loss": 0.4738, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.749066114425659, + "rewards/margins": 0.984779417514801, + "rewards/rejected": -4.7338457107543945, + "step": 1262 + }, + { + "epoch": 0.8248836639725692, + "grad_norm": 40.800361223632876, + "learning_rate": 1.3588142938222421e-08, + "logits/chosen": -1.5330525636672974, + "logits/rejected": -1.5526435375213623, + "logps/chosen": -796.693603515625, + "logps/rejected": -926.6709594726562, + "loss": 0.4648, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9938881397247314, + "rewards/margins": 1.054705262184143, + "rewards/rejected": -4.048593521118164, + "step": 1263 + }, + { + "epoch": 0.8255367785125316, + "grad_norm": 16.08173399476035, + "learning_rate": 1.3490077849794333e-08, + "logits/chosen": -1.5841691493988037, + "logits/rejected": -1.5560128688812256, + "logps/chosen": -929.83056640625, + "logps/rejected": -988.2870483398438, + "loss": 0.4453, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5736300945281982, + "rewards/margins": 0.8916316032409668, + "rewards/rejected": -4.465262413024902, + "step": 1264 + }, + { + "epoch": 0.8261898930524941, + "grad_norm": 18.753303938177545, + "learning_rate": 1.3392332928577994e-08, + "logits/chosen": -1.5194913148880005, + "logits/rejected": -1.525078535079956, + "logps/chosen": -871.6900024414062, + "logps/rejected": -965.0679931640625, + "loss": 0.4307, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.447103977203369, + "rewards/margins": 0.7160016298294067, + "rewards/rejected": -4.163105487823486, + "step": 1265 + }, + { + "epoch": 0.8268430075924565, + "grad_norm": 37.23704455866974, + "learning_rate": 1.3294908683348535e-08, + "logits/chosen": -1.4927794933319092, + "logits/rejected": -1.459194302558899, + "logps/chosen": -876.373046875, + "logps/rejected": -885.762939453125, + "loss": 0.5455, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8211159706115723, + "rewards/margins": 0.5798470973968506, + "rewards/rejected": -4.400962829589844, + "step": 1266 + }, + { + "epoch": 0.827496122132419, + "grad_norm": 41.88538792702506, + "learning_rate": 1.3197805621211925e-08, + "logits/chosen": -1.4792529344558716, + "logits/rejected": -1.4594306945800781, + "logps/chosen": -812.187744140625, + "logps/rejected": -929.7012939453125, + "loss": 0.475, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.358401298522949, + "rewards/margins": 0.9242434501647949, + "rewards/rejected": -4.282644748687744, + "step": 1267 + }, + { + "epoch": 0.8281492366723814, + "grad_norm": 143.83646829408002, + "learning_rate": 1.3101024247602339e-08, + "logits/chosen": -1.3687011003494263, + "logits/rejected": -1.416597843170166, + "logps/chosen": -873.91796875, + "logps/rejected": -925.439453125, + "loss": 0.501, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.801042318344116, + "rewards/margins": 0.3380916118621826, + "rewards/rejected": -4.139133930206299, + "step": 1268 + }, + { + "epoch": 0.8288023512123439, + "grad_norm": 12.16391760621639, + "learning_rate": 1.3004565066279519e-08, + "logits/chosen": -1.5240532159805298, + "logits/rejected": -1.5018377304077148, + "logps/chosen": -785.153076171875, + "logps/rejected": -848.844970703125, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3579812049865723, + "rewards/margins": 0.8355432748794556, + "rewards/rejected": -4.193524360656738, + "step": 1269 + }, + { + "epoch": 0.8294554657523063, + "grad_norm": 12.97712151020145, + "learning_rate": 1.2908428579326159e-08, + "logits/chosen": -1.483147144317627, + "logits/rejected": -1.440002202987671, + "logps/chosen": -855.696044921875, + "logps/rejected": -937.5626831054688, + "loss": 0.513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.419983148574829, + "rewards/margins": 1.2121243476867676, + "rewards/rejected": -4.632107734680176, + "step": 1270 + }, + { + "epoch": 0.8301085802922687, + "grad_norm": 61.10324755914619, + "learning_rate": 1.2812615287145276e-08, + "logits/chosen": -1.4333446025848389, + "logits/rejected": -1.4426469802856445, + "logps/chosen": -779.0205078125, + "logps/rejected": -882.3414306640625, + "loss": 0.5547, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4126522541046143, + "rewards/margins": 0.8153400421142578, + "rewards/rejected": -4.227993011474609, + "step": 1271 + }, + { + "epoch": 0.8307616948322312, + "grad_norm": 49.65538467582004, + "learning_rate": 1.2717125688457627e-08, + "logits/chosen": -1.4278748035430908, + "logits/rejected": -1.4869357347488403, + "logps/chosen": -847.4019165039062, + "logps/rejected": -1110.864501953125, + "loss": 0.5348, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.525916576385498, + "rewards/margins": 1.2023837566375732, + "rewards/rejected": -4.728300094604492, + "step": 1272 + }, + { + "epoch": 0.8314148093721937, + "grad_norm": 21.25568560854086, + "learning_rate": 1.2621960280299093e-08, + "logits/chosen": -1.498213529586792, + "logits/rejected": -1.5190093517303467, + "logps/chosen": -889.1369018554688, + "logps/rejected": -935.4756469726562, + "loss": 0.5172, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5845720767974854, + "rewards/margins": 1.0147137641906738, + "rewards/rejected": -4.599286079406738, + "step": 1273 + }, + { + "epoch": 0.8320679239121561, + "grad_norm": 20.186165187940464, + "learning_rate": 1.252711955801811e-08, + "logits/chosen": -1.4972944259643555, + "logits/rejected": -1.515435814857483, + "logps/chosen": -775.0640869140625, + "logps/rejected": -830.6438598632812, + "loss": 0.4944, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7457919120788574, + "rewards/margins": 0.6729423999786377, + "rewards/rejected": -3.418734073638916, + "step": 1274 + }, + { + "epoch": 0.8327210384521185, + "grad_norm": 80.31553260847888, + "learning_rate": 1.2432604015273082e-08, + "logits/chosen": -1.5685982704162598, + "logits/rejected": -1.5332074165344238, + "logps/chosen": -850.53076171875, + "logps/rejected": -1026.168701171875, + "loss": 0.4492, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.52174973487854, + "rewards/margins": 1.3801482915878296, + "rewards/rejected": -4.901898384094238, + "step": 1275 + }, + { + "epoch": 0.833374152992081, + "grad_norm": 44.67438081658325, + "learning_rate": 1.2338414144029779e-08, + "logits/chosen": -1.4650599956512451, + "logits/rejected": -1.5019781589508057, + "logps/chosen": -831.616455078125, + "logps/rejected": -865.5159912109375, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3090555667877197, + "rewards/margins": 0.5248807668685913, + "rewards/rejected": -3.8339362144470215, + "step": 1276 + }, + { + "epoch": 0.8340272675320435, + "grad_norm": 39.396696943484436, + "learning_rate": 1.2244550434558842e-08, + "logits/chosen": -1.461850643157959, + "logits/rejected": -1.453994631767273, + "logps/chosen": -843.3770751953125, + "logps/rejected": -1038.049072265625, + "loss": 0.5168, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4972283840179443, + "rewards/margins": 1.6629681587219238, + "rewards/rejected": -5.160196781158447, + "step": 1277 + }, + { + "epoch": 0.8346803820720059, + "grad_norm": 65.0379831567537, + "learning_rate": 1.2151013375433202e-08, + "logits/chosen": -1.553591012954712, + "logits/rejected": -1.507093906402588, + "logps/chosen": -870.8557739257812, + "logps/rejected": -965.6985473632812, + "loss": 0.4459, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.6408274173736572, + "rewards/margins": 1.1605440378189087, + "rewards/rejected": -4.801371097564697, + "step": 1278 + }, + { + "epoch": 0.8353334966119683, + "grad_norm": 30.676217173011974, + "learning_rate": 1.2057803453525502e-08, + "logits/chosen": -1.468326449394226, + "logits/rejected": -1.4737112522125244, + "logps/chosen": -782.1486206054688, + "logps/rejected": -842.2639770507812, + "loss": 0.5042, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.191077709197998, + "rewards/margins": 0.6720733046531677, + "rewards/rejected": -3.8631508350372314, + "step": 1279 + }, + { + "epoch": 0.8359866111519307, + "grad_norm": 107.16302969733803, + "learning_rate": 1.1964921154005631e-08, + "logits/chosen": -1.4318448305130005, + "logits/rejected": -1.4438923597335815, + "logps/chosen": -870.4271850585938, + "logps/rejected": -939.97900390625, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3240041732788086, + "rewards/margins": 0.9650790095329285, + "rewards/rejected": -4.289083003997803, + "step": 1280 + }, + { + "epoch": 0.8366397256918933, + "grad_norm": 37.456288698932234, + "learning_rate": 1.187236696033812e-08, + "logits/chosen": -1.5122158527374268, + "logits/rejected": -1.454342007637024, + "logps/chosen": -791.8545532226562, + "logps/rejected": -874.0135498046875, + "loss": 0.4348, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.786623239517212, + "rewards/margins": 0.9403093457221985, + "rewards/rejected": -3.7269325256347656, + "step": 1281 + }, + { + "epoch": 0.8372928402318557, + "grad_norm": 32.26978309244302, + "learning_rate": 1.1780141354279698e-08, + "logits/chosen": -1.4852337837219238, + "logits/rejected": -1.4611694812774658, + "logps/chosen": -899.8671875, + "logps/rejected": -902.857666015625, + "loss": 0.4584, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5996108055114746, + "rewards/margins": 0.4853794276714325, + "rewards/rejected": -4.08499002456665, + "step": 1282 + }, + { + "epoch": 0.8379459547718181, + "grad_norm": 67.20483755586449, + "learning_rate": 1.1688244815876735e-08, + "logits/chosen": -1.4753847122192383, + "logits/rejected": -1.495114803314209, + "logps/chosen": -775.127685546875, + "logps/rejected": -854.59423828125, + "loss": 0.4918, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.08735728263855, + "rewards/margins": 0.8931238055229187, + "rewards/rejected": -3.9804811477661133, + "step": 1283 + }, + { + "epoch": 0.8385990693117805, + "grad_norm": 31.10069114339872, + "learning_rate": 1.1596677823462769e-08, + "logits/chosen": -1.4978846311569214, + "logits/rejected": -1.5116081237792969, + "logps/chosen": -835.6173095703125, + "logps/rejected": -892.9727783203125, + "loss": 0.4633, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4220669269561768, + "rewards/margins": 1.0128189325332642, + "rewards/rejected": -4.4348859786987305, + "step": 1284 + }, + { + "epoch": 0.839252183851743, + "grad_norm": 95.67340364123025, + "learning_rate": 1.1505440853655996e-08, + "logits/chosen": -1.580156922340393, + "logits/rejected": -1.564608097076416, + "logps/chosen": -828.8040771484375, + "logps/rejected": -858.2422485351562, + "loss": 0.5397, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2406132221221924, + "rewards/margins": 0.5403940677642822, + "rewards/rejected": -3.7810075283050537, + "step": 1285 + }, + { + "epoch": 0.8399052983917055, + "grad_norm": 20.610401240104963, + "learning_rate": 1.1414534381356817e-08, + "logits/chosen": -1.457862377166748, + "logits/rejected": -1.4158681631088257, + "logps/chosen": -854.0062255859375, + "logps/rejected": -980.3665771484375, + "loss": 0.5507, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.427743911743164, + "rewards/margins": 1.1576130390167236, + "rewards/rejected": -4.585356712341309, + "step": 1286 + }, + { + "epoch": 0.8405584129316679, + "grad_norm": 15.387698511738025, + "learning_rate": 1.132395887974536e-08, + "logits/chosen": -1.512953758239746, + "logits/rejected": -1.4666166305541992, + "logps/chosen": -873.6098022460938, + "logps/rejected": -931.2805786132812, + "loss": 0.5066, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.396209478378296, + "rewards/margins": 0.8550810813903809, + "rewards/rejected": -4.251290798187256, + "step": 1287 + }, + { + "epoch": 0.8412115274716303, + "grad_norm": 17.455285669432005, + "learning_rate": 1.123371482027895e-08, + "logits/chosen": -1.4253538846969604, + "logits/rejected": -1.4188988208770752, + "logps/chosen": -823.5361328125, + "logps/rejected": -855.1928100585938, + "loss": 0.4807, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0535998344421387, + "rewards/margins": 1.1148144006729126, + "rewards/rejected": -4.16841459274292, + "step": 1288 + }, + { + "epoch": 0.8418646420115928, + "grad_norm": 95.33465590024296, + "learning_rate": 1.1143802672689772e-08, + "logits/chosen": -1.5128263235092163, + "logits/rejected": -1.5237712860107422, + "logps/chosen": -804.3637084960938, + "logps/rejected": -936.9696044921875, + "loss": 0.4619, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.063973903656006, + "rewards/margins": 0.989101231098175, + "rewards/rejected": -4.053074836730957, + "step": 1289 + }, + { + "epoch": 0.8425177565515553, + "grad_norm": 18.893079688911747, + "learning_rate": 1.1054222904982346e-08, + "logits/chosen": -1.4855834245681763, + "logits/rejected": -1.4457110166549683, + "logps/chosen": -862.0806274414062, + "logps/rejected": -923.0704956054688, + "loss": 0.4819, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3044419288635254, + "rewards/margins": 1.0387816429138184, + "rewards/rejected": -4.343223571777344, + "step": 1290 + }, + { + "epoch": 0.8431708710915177, + "grad_norm": 36.3373091887223, + "learning_rate": 1.0964975983431116e-08, + "logits/chosen": -1.5048320293426514, + "logits/rejected": -1.5244179964065552, + "logps/chosen": -848.87548828125, + "logps/rejected": -917.09619140625, + "loss": 0.4807, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.414567708969116, + "rewards/margins": 0.6657711267471313, + "rewards/rejected": -4.080338954925537, + "step": 1291 + }, + { + "epoch": 0.8438239856314801, + "grad_norm": 41.15768005703854, + "learning_rate": 1.0876062372578e-08, + "logits/chosen": -1.4884114265441895, + "logits/rejected": -1.495098352432251, + "logps/chosen": -854.0869750976562, + "logps/rejected": -878.1396484375, + "loss": 0.5553, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5928542613983154, + "rewards/margins": 0.4925929307937622, + "rewards/rejected": -4.085447311401367, + "step": 1292 + }, + { + "epoch": 0.8444771001714426, + "grad_norm": 36.42635614805045, + "learning_rate": 1.0787482535230022e-08, + "logits/chosen": -1.4687424898147583, + "logits/rejected": -1.4143062829971313, + "logps/chosen": -813.6422729492188, + "logps/rejected": -855.9901123046875, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0875415802001953, + "rewards/margins": 0.5753628015518188, + "rewards/rejected": -3.6629042625427246, + "step": 1293 + }, + { + "epoch": 0.845130214711405, + "grad_norm": 29.828612800376433, + "learning_rate": 1.0699236932456835e-08, + "logits/chosen": -1.6260933876037598, + "logits/rejected": -1.5973844528198242, + "logps/chosen": -855.070068359375, + "logps/rejected": -896.7317504882812, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.386779546737671, + "rewards/margins": 0.6484910249710083, + "rewards/rejected": -4.035270690917969, + "step": 1294 + }, + { + "epoch": 0.8457833292513675, + "grad_norm": 42.20579198570261, + "learning_rate": 1.0611326023588388e-08, + "logits/chosen": -1.4956504106521606, + "logits/rejected": -1.4156677722930908, + "logps/chosen": -786.1782836914062, + "logps/rejected": -816.1480102539062, + "loss": 0.563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0679595470428467, + "rewards/margins": 0.5685796141624451, + "rewards/rejected": -3.6365387439727783, + "step": 1295 + }, + { + "epoch": 0.8464364437913299, + "grad_norm": 13.728489619296084, + "learning_rate": 1.0523750266212483e-08, + "logits/chosen": -1.5465428829193115, + "logits/rejected": -1.500390887260437, + "logps/chosen": -882.738525390625, + "logps/rejected": -909.25634765625, + "loss": 0.5011, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.744199752807617, + "rewards/margins": 0.5797795653343201, + "rewards/rejected": -4.323978900909424, + "step": 1296 + }, + { + "epoch": 0.8470895583312924, + "grad_norm": 16.665414724980934, + "learning_rate": 1.0436510116172425e-08, + "logits/chosen": -1.4301414489746094, + "logits/rejected": -1.4165202379226685, + "logps/chosen": -852.8707275390625, + "logps/rejected": -919.6453857421875, + "loss": 0.5169, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.4321398735046387, + "rewards/margins": 0.7153012156486511, + "rewards/rejected": -4.1474409103393555, + "step": 1297 + }, + { + "epoch": 0.8477426728712548, + "grad_norm": 51.89482440292027, + "learning_rate": 1.0349606027564633e-08, + "logits/chosen": -1.4815446138381958, + "logits/rejected": -1.4437528848648071, + "logps/chosen": -808.208984375, + "logps/rejected": -885.4605712890625, + "loss": 0.4939, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.99233078956604, + "rewards/margins": 0.8016055822372437, + "rewards/rejected": -3.7939364910125732, + "step": 1298 + }, + { + "epoch": 0.8483957874112172, + "grad_norm": 47.61880788832745, + "learning_rate": 1.0263038452736292e-08, + "logits/chosen": -1.4789763689041138, + "logits/rejected": -1.4989715814590454, + "logps/chosen": -751.89453125, + "logps/rejected": -971.8363647460938, + "loss": 0.4495, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0481152534484863, + "rewards/margins": 1.4892245531082153, + "rewards/rejected": -4.53734016418457, + "step": 1299 + }, + { + "epoch": 0.8490489019511797, + "grad_norm": 12.835741715040568, + "learning_rate": 1.0176807842282977e-08, + "logits/chosen": -1.4237325191497803, + "logits/rejected": -1.4008545875549316, + "logps/chosen": -822.1392211914062, + "logps/rejected": -825.3770751953125, + "loss": 0.5182, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1958327293395996, + "rewards/margins": 0.45696282386779785, + "rewards/rejected": -3.6527957916259766, + "step": 1300 + }, + { + "epoch": 0.8490489019511797, + "eval_logits/chosen": -1.4787728786468506, + "eval_logits/rejected": -1.4600926637649536, + "eval_logps/chosen": -842.8944702148438, + "eval_logps/rejected": -926.5396118164062, + "eval_loss": 0.49515822529792786, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -3.3254919052124023, + "eval_rewards/margins": 0.9175096750259399, + "eval_rewards/rejected": -4.243001461029053, + "eval_runtime": 296.4556, + "eval_samples_per_second": 13.493, + "eval_steps_per_second": 0.843, + "step": 1300 + }, + { + "epoch": 0.8497020164911422, + "grad_norm": 66.58487284404727, + "learning_rate": 1.009091464504633e-08, + "logits/chosen": -1.4258389472961426, + "logits/rejected": -1.3811410665512085, + "logps/chosen": -866.520751953125, + "logps/rejected": -932.500244140625, + "loss": 0.5058, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8197102546691895, + "rewards/margins": 0.9209631085395813, + "rewards/rejected": -4.740673542022705, + "step": 1301 + }, + { + "epoch": 0.8503551310311046, + "grad_norm": 15.745580341420917, + "learning_rate": 1.0005359308111702e-08, + "logits/chosen": -1.535689353942871, + "logits/rejected": -1.5087151527404785, + "logps/chosen": -806.9986572265625, + "logps/rejected": -910.8426513671875, + "loss": 0.4717, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.248060941696167, + "rewards/margins": 1.305046558380127, + "rewards/rejected": -4.553107738494873, + "step": 1302 + }, + { + "epoch": 0.851008245571067, + "grad_norm": 62.332572039035284, + "learning_rate": 9.920142276805852e-09, + "logits/chosen": -1.465049386024475, + "logits/rejected": -1.4151432514190674, + "logps/chosen": -927.4882202148438, + "logps/rejected": -980.1682739257812, + "loss": 0.5446, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.472818374633789, + "rewards/margins": 0.7706753611564636, + "rewards/rejected": -4.243494033813477, + "step": 1303 + }, + { + "epoch": 0.8516613601110294, + "grad_norm": 33.565372393934965, + "learning_rate": 9.835263994694587e-09, + "logits/chosen": -1.46824049949646, + "logits/rejected": -1.4706170558929443, + "logps/chosen": -960.3832397460938, + "logps/rejected": -1062.023681640625, + "loss": 0.4433, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9557857513427734, + "rewards/margins": 0.9899990558624268, + "rewards/rejected": -4.945785045623779, + "step": 1304 + }, + { + "epoch": 0.852314474650992, + "grad_norm": 25.81469581412769, + "learning_rate": 9.750724903580503e-09, + "logits/chosen": -1.5085800886154175, + "logits/rejected": -1.5068165063858032, + "logps/chosen": -845.734130859375, + "logps/rejected": -906.9862060546875, + "loss": 0.4746, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1564383506774902, + "rewards/margins": 0.7602788209915161, + "rewards/rejected": -3.916717052459717, + "step": 1305 + }, + { + "epoch": 0.8529675891909544, + "grad_norm": 17.54341908103785, + "learning_rate": 9.666525443500666e-09, + "logits/chosen": -1.5109413862228394, + "logits/rejected": -1.397308349609375, + "logps/chosen": -848.3529052734375, + "logps/rejected": -978.5413818359375, + "loss": 0.5116, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.651090145111084, + "rewards/margins": 1.1326894760131836, + "rewards/rejected": -4.783779144287109, + "step": 1306 + }, + { + "epoch": 0.8536207037309168, + "grad_norm": 17.25055458777116, + "learning_rate": 9.582666052724305e-09, + "logits/chosen": -1.5393823385238647, + "logits/rejected": -1.5182414054870605, + "logps/chosen": -882.0748901367188, + "logps/rejected": -946.6273803710938, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4100325107574463, + "rewards/margins": 0.5562119483947754, + "rewards/rejected": -3.9662444591522217, + "step": 1307 + }, + { + "epoch": 0.8542738182708792, + "grad_norm": 16.192143577334363, + "learning_rate": 9.499147167750541e-09, + "logits/chosen": -1.425965428352356, + "logits/rejected": -1.4162306785583496, + "logps/chosen": -811.1448364257812, + "logps/rejected": -926.71435546875, + "loss": 0.4845, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.059163808822632, + "rewards/margins": 0.9071324467658997, + "rewards/rejected": -3.9662961959838867, + "step": 1308 + }, + { + "epoch": 0.8549269328108418, + "grad_norm": 74.71761901774033, + "learning_rate": 9.415969223306133e-09, + "logits/chosen": -1.4104845523834229, + "logits/rejected": -1.4388418197631836, + "logps/chosen": -902.40380859375, + "logps/rejected": -968.486083984375, + "loss": 0.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.3599321842193604, + "rewards/margins": 0.6312100291252136, + "rewards/rejected": -3.9911422729492188, + "step": 1309 + }, + { + "epoch": 0.8555800473508042, + "grad_norm": 33.85232737585008, + "learning_rate": 9.333132652343193e-09, + "logits/chosen": -1.5950632095336914, + "logits/rejected": -1.5360954999923706, + "logps/chosen": -822.3123779296875, + "logps/rejected": -889.439697265625, + "loss": 0.4481, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.34323787689209, + "rewards/margins": 1.0119768381118774, + "rewards/rejected": -4.355214595794678, + "step": 1310 + }, + { + "epoch": 0.8562331618907666, + "grad_norm": 64.61699538785297, + "learning_rate": 9.250637886036913e-09, + "logits/chosen": -1.4856970310211182, + "logits/rejected": -1.4961824417114258, + "logps/chosen": -820.3836059570312, + "logps/rejected": -874.966796875, + "loss": 0.4821, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9959676265716553, + "rewards/margins": 0.6477092504501343, + "rewards/rejected": -3.6436767578125, + "step": 1311 + }, + { + "epoch": 0.856886276430729, + "grad_norm": 49.59060889243429, + "learning_rate": 9.16848535378339e-09, + "logits/chosen": -1.4910832643508911, + "logits/rejected": -1.4679546356201172, + "logps/chosen": -820.6103515625, + "logps/rejected": -934.1729125976562, + "loss": 0.4813, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2271547317504883, + "rewards/margins": 1.107942819595337, + "rewards/rejected": -4.335097312927246, + "step": 1312 + }, + { + "epoch": 0.8575393909706915, + "grad_norm": 67.84816938025705, + "learning_rate": 9.086675483197323e-09, + "logits/chosen": -1.4484561681747437, + "logits/rejected": -1.424304485321045, + "logps/chosen": -858.7589111328125, + "logps/rejected": -946.8106079101562, + "loss": 0.4282, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.125239849090576, + "rewards/margins": 0.7515519857406616, + "rewards/rejected": -3.876791477203369, + "step": 1313 + }, + { + "epoch": 0.858192505510654, + "grad_norm": 25.662991507947012, + "learning_rate": 9.005208700109817e-09, + "logits/chosen": -1.4877333641052246, + "logits/rejected": -1.4774301052093506, + "logps/chosen": -765.2407836914062, + "logps/rejected": -906.0118408203125, + "loss": 0.4055, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1628265380859375, + "rewards/margins": 1.3297454118728638, + "rewards/rejected": -4.492571830749512, + "step": 1314 + }, + { + "epoch": 0.8588456200506164, + "grad_norm": 127.58031473226492, + "learning_rate": 8.924085428566163e-09, + "logits/chosen": -1.4710652828216553, + "logits/rejected": -1.4731792211532593, + "logps/chosen": -753.9927978515625, + "logps/rejected": -920.06005859375, + "loss": 0.5066, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.049560070037842, + "rewards/margins": 0.948535680770874, + "rewards/rejected": -3.998095750808716, + "step": 1315 + }, + { + "epoch": 0.8594987345905788, + "grad_norm": 45.1109159351009, + "learning_rate": 8.843306090823632e-09, + "logits/chosen": -1.45258367061615, + "logits/rejected": -1.4658076763153076, + "logps/chosen": -828.3963012695312, + "logps/rejected": -949.1826171875, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.294034481048584, + "rewards/margins": 0.9175934791564941, + "rewards/rejected": -4.211627960205078, + "step": 1316 + }, + { + "epoch": 0.8601518491305413, + "grad_norm": 29.986901265876828, + "learning_rate": 8.762871107349267e-09, + "logits/chosen": -1.511345386505127, + "logits/rejected": -1.4521244764328003, + "logps/chosen": -778.9940795898438, + "logps/rejected": -893.1011962890625, + "loss": 0.5278, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.323246479034424, + "rewards/margins": 1.0293022394180298, + "rewards/rejected": -4.352548599243164, + "step": 1317 + }, + { + "epoch": 0.8608049636705037, + "grad_norm": 11.35851659183948, + "learning_rate": 8.682780896817716e-09, + "logits/chosen": -1.3951612710952759, + "logits/rejected": -1.4687902927398682, + "logps/chosen": -848.2562866210938, + "logps/rejected": -946.0987548828125, + "loss": 0.5036, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4302592277526855, + "rewards/margins": 0.6691716909408569, + "rewards/rejected": -4.099431037902832, + "step": 1318 + }, + { + "epoch": 0.8614580782104662, + "grad_norm": 11.967451850970171, + "learning_rate": 8.603035876109013e-09, + "logits/chosen": -1.5124762058258057, + "logits/rejected": -1.4754977226257324, + "logps/chosen": -814.0156860351562, + "logps/rejected": -796.652587890625, + "loss": 0.4813, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.291874647140503, + "rewards/margins": 0.5844862461090088, + "rewards/rejected": -3.8763608932495117, + "step": 1319 + }, + { + "epoch": 0.8621111927504286, + "grad_norm": 120.18322878609834, + "learning_rate": 8.523636460306463e-09, + "logits/chosen": -1.4285402297973633, + "logits/rejected": -1.4294164180755615, + "logps/chosen": -782.0220336914062, + "logps/rejected": -898.1871948242188, + "loss": 0.5255, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4391157627105713, + "rewards/margins": 0.8806977868080139, + "rewards/rejected": -4.3198137283325195, + "step": 1320 + }, + { + "epoch": 0.862764307290391, + "grad_norm": 48.61379884717116, + "learning_rate": 8.444583062694439e-09, + "logits/chosen": -1.3637633323669434, + "logits/rejected": -1.3878041505813599, + "logps/chosen": -757.3119506835938, + "logps/rejected": -860.0999145507812, + "loss": 0.5276, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0247325897216797, + "rewards/margins": 0.7837187051773071, + "rewards/rejected": -3.8084514141082764, + "step": 1321 + }, + { + "epoch": 0.8634174218303535, + "grad_norm": 11.712512451580766, + "learning_rate": 8.365876094756228e-09, + "logits/chosen": -1.5599374771118164, + "logits/rejected": -1.5587773323059082, + "logps/chosen": -911.1922607421875, + "logps/rejected": -981.4324340820312, + "loss": 0.5147, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.283015251159668, + "rewards/margins": 0.8019892573356628, + "rewards/rejected": -4.085004806518555, + "step": 1322 + }, + { + "epoch": 0.864070536370316, + "grad_norm": 131.68861126525354, + "learning_rate": 8.287515966171928e-09, + "logits/chosen": -1.5320029258728027, + "logits/rejected": -1.516080379486084, + "logps/chosen": -924.517822265625, + "logps/rejected": -953.4563598632812, + "loss": 0.5502, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6483347415924072, + "rewards/margins": 0.7624077796936035, + "rewards/rejected": -4.41074275970459, + "step": 1323 + }, + { + "epoch": 0.8647236509102784, + "grad_norm": 24.356458817864755, + "learning_rate": 8.209503084816285e-09, + "logits/chosen": -1.546379566192627, + "logits/rejected": -1.4577432870864868, + "logps/chosen": -800.8156127929688, + "logps/rejected": -880.213134765625, + "loss": 0.4574, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0747878551483154, + "rewards/margins": 1.0365558862686157, + "rewards/rejected": -4.1113433837890625, + "step": 1324 + }, + { + "epoch": 0.8653767654502408, + "grad_norm": 33.56562789954909, + "learning_rate": 8.131837856756585e-09, + "logits/chosen": -1.4879792928695679, + "logits/rejected": -1.4731531143188477, + "logps/chosen": -843.258056640625, + "logps/rejected": -948.2938232421875, + "loss": 0.5075, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.485196828842163, + "rewards/margins": 1.1023627519607544, + "rewards/rejected": -4.587559223175049, + "step": 1325 + }, + { + "epoch": 0.8660298799902033, + "grad_norm": 60.746333244964944, + "learning_rate": 8.054520686250512e-09, + "logits/chosen": -1.4594736099243164, + "logits/rejected": -1.4537848234176636, + "logps/chosen": -705.6865844726562, + "logps/rejected": -833.6557006835938, + "loss": 0.4751, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.902322769165039, + "rewards/margins": 0.7870765924453735, + "rewards/rejected": -3.689399242401123, + "step": 1326 + }, + { + "epoch": 0.8666829945301657, + "grad_norm": 27.41375334113206, + "learning_rate": 7.977551975744088e-09, + "logits/chosen": -1.5716170072555542, + "logits/rejected": -1.5631660223007202, + "logps/chosen": -939.8978271484375, + "logps/rejected": -984.3545532226562, + "loss": 0.5208, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.000885963439941, + "rewards/margins": 0.5963870286941528, + "rewards/rejected": -4.597273349761963, + "step": 1327 + }, + { + "epoch": 0.8673361090701281, + "grad_norm": 16.93864295624459, + "learning_rate": 7.900932125869545e-09, + "logits/chosen": -1.4590840339660645, + "logits/rejected": -1.4381649494171143, + "logps/chosen": -835.8486938476562, + "logps/rejected": -1007.2196655273438, + "loss": 0.5122, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.375314712524414, + "rewards/margins": 1.4108258485794067, + "rewards/rejected": -4.786140441894531, + "step": 1328 + }, + { + "epoch": 0.8679892236100906, + "grad_norm": 28.716003483649484, + "learning_rate": 7.824661535443247e-09, + "logits/chosen": -1.6202168464660645, + "logits/rejected": -1.5609275102615356, + "logps/chosen": -914.7964477539062, + "logps/rejected": -1054.38525390625, + "loss": 0.5092, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.545833110809326, + "rewards/margins": 1.0506269931793213, + "rewards/rejected": -4.596460342407227, + "step": 1329 + }, + { + "epoch": 0.8686423381500531, + "grad_norm": 41.01865127499047, + "learning_rate": 7.748740601463622e-09, + "logits/chosen": -1.48884117603302, + "logits/rejected": -1.5338010787963867, + "logps/chosen": -731.7664794921875, + "logps/rejected": -812.95458984375, + "loss": 0.4394, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1326169967651367, + "rewards/margins": 0.877723217010498, + "rewards/rejected": -4.010340213775635, + "step": 1330 + }, + { + "epoch": 0.8692954526900155, + "grad_norm": 64.11269025670589, + "learning_rate": 7.673169719109091e-09, + "logits/chosen": -1.4994860887527466, + "logits/rejected": -1.4728403091430664, + "logps/chosen": -873.9459228515625, + "logps/rejected": -884.364501953125, + "loss": 0.5138, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.492875337600708, + "rewards/margins": 0.4393343925476074, + "rewards/rejected": -3.9322092533111572, + "step": 1331 + }, + { + "epoch": 0.8699485672299779, + "grad_norm": 38.41559411716, + "learning_rate": 7.597949281736019e-09, + "logits/chosen": -1.5339343547821045, + "logits/rejected": -1.4958739280700684, + "logps/chosen": -808.7354125976562, + "logps/rejected": -932.888916015625, + "loss": 0.5561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.077152967453003, + "rewards/margins": 1.2154319286346436, + "rewards/rejected": -4.2925848960876465, + "step": 1332 + }, + { + "epoch": 0.8706016817699403, + "grad_norm": 12.205155892018915, + "learning_rate": 7.523079680876613e-09, + "logits/chosen": -1.416054368019104, + "logits/rejected": -1.377394676208496, + "logps/chosen": -824.678955078125, + "logps/rejected": -965.9581909179688, + "loss": 0.4893, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.192705154418945, + "rewards/margins": 0.9551449418067932, + "rewards/rejected": -5.147850036621094, + "step": 1333 + }, + { + "epoch": 0.8712547963099029, + "grad_norm": 57.99000568713857, + "learning_rate": 7.448561306236989e-09, + "logits/chosen": -1.4369691610336304, + "logits/rejected": -1.4728755950927734, + "logps/chosen": -964.21435546875, + "logps/rejected": -1086.426513671875, + "loss": 0.4682, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6197071075439453, + "rewards/margins": 1.294512391090393, + "rewards/rejected": -4.914219379425049, + "step": 1334 + }, + { + "epoch": 0.8719079108498653, + "grad_norm": 20.044699015378878, + "learning_rate": 7.374394545695062e-09, + "logits/chosen": -1.4569401741027832, + "logits/rejected": -1.4874026775360107, + "logps/chosen": -895.0442504882812, + "logps/rejected": -896.1160888671875, + "loss": 0.4917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.06990385055542, + "rewards/margins": 0.44757652282714844, + "rewards/rejected": -3.5174806118011475, + "step": 1335 + }, + { + "epoch": 0.8725610253898277, + "grad_norm": 37.40096323479501, + "learning_rate": 7.300579785298516e-09, + "logits/chosen": -1.5550789833068848, + "logits/rejected": -1.561784029006958, + "logps/chosen": -865.9757690429688, + "logps/rejected": -1000.7755126953125, + "loss": 0.461, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.3865833282470703, + "rewards/margins": 0.9827724695205688, + "rewards/rejected": -4.36935567855835, + "step": 1336 + }, + { + "epoch": 0.8732141399297901, + "grad_norm": 10.447772657085043, + "learning_rate": 7.227117409262912e-09, + "logits/chosen": -1.477044701576233, + "logits/rejected": -1.4742741584777832, + "logps/chosen": -804.1036987304688, + "logps/rejected": -945.7398681640625, + "loss": 0.4617, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0770020484924316, + "rewards/margins": 1.022882342338562, + "rewards/rejected": -4.099884033203125, + "step": 1337 + }, + { + "epoch": 0.8738672544697527, + "grad_norm": 16.999701561801512, + "learning_rate": 7.154007799969517e-09, + "logits/chosen": -1.5495110750198364, + "logits/rejected": -1.5202927589416504, + "logps/chosen": -835.4287109375, + "logps/rejected": -846.221435546875, + "loss": 0.4887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.041454315185547, + "rewards/margins": 0.6531744003295898, + "rewards/rejected": -3.6946287155151367, + "step": 1338 + }, + { + "epoch": 0.8745203690097151, + "grad_norm": 93.85006471838479, + "learning_rate": 7.081251337963442e-09, + "logits/chosen": -1.581192970275879, + "logits/rejected": -1.5974817276000977, + "logps/chosen": -931.1429443359375, + "logps/rejected": -1022.624755859375, + "loss": 0.4658, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4839534759521484, + "rewards/margins": 1.0064407587051392, + "rewards/rejected": -4.490394115447998, + "step": 1339 + }, + { + "epoch": 0.8751734835496775, + "grad_norm": 57.219568967053746, + "learning_rate": 7.008848401951622e-09, + "logits/chosen": -1.449042558670044, + "logits/rejected": -1.4050977230072021, + "logps/chosen": -781.509521484375, + "logps/rejected": -854.7266845703125, + "loss": 0.5102, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.965616226196289, + "rewards/margins": 0.8889732956886292, + "rewards/rejected": -3.8545899391174316, + "step": 1340 + }, + { + "epoch": 0.8758265980896399, + "grad_norm": 73.1771564404403, + "learning_rate": 6.9367993688008195e-09, + "logits/chosen": -1.4334369897842407, + "logits/rejected": -1.4349690675735474, + "logps/chosen": -838.9798583984375, + "logps/rejected": -894.887939453125, + "loss": 0.4708, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.614448070526123, + "rewards/margins": 0.6810629367828369, + "rewards/rejected": -4.295510768890381, + "step": 1341 + }, + { + "epoch": 0.8764797126296024, + "grad_norm": 88.67111576311824, + "learning_rate": 6.865104613535719e-09, + "logits/chosen": -1.5720555782318115, + "logits/rejected": -1.556181788444519, + "logps/chosen": -1001.600830078125, + "logps/rejected": -1061.5048828125, + "loss": 0.5376, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.814486503601074, + "rewards/margins": 0.7989178895950317, + "rewards/rejected": -4.613404273986816, + "step": 1342 + }, + { + "epoch": 0.8771328271695649, + "grad_norm": 107.66602305396016, + "learning_rate": 6.7937645093369076e-09, + "logits/chosen": -1.5547233819961548, + "logits/rejected": -1.5719074010849, + "logps/chosen": -969.04296875, + "logps/rejected": -1082.557373046875, + "loss": 0.4579, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.629171371459961, + "rewards/margins": 1.1535873413085938, + "rewards/rejected": -4.782759189605713, + "step": 1343 + }, + { + "epoch": 0.8777859417095273, + "grad_norm": 12.704339641522967, + "learning_rate": 6.722779427539007e-09, + "logits/chosen": -1.5757780075073242, + "logits/rejected": -1.5685677528381348, + "logps/chosen": -892.2096557617188, + "logps/rejected": -1070.7259521484375, + "loss": 0.5011, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5382652282714844, + "rewards/margins": 1.3167110681533813, + "rewards/rejected": -4.854976654052734, + "step": 1344 + }, + { + "epoch": 0.8784390562494897, + "grad_norm": 74.90456047914228, + "learning_rate": 6.6521497376286425e-09, + "logits/chosen": -1.4474490880966187, + "logits/rejected": -1.393038272857666, + "logps/chosen": -863.8945922851562, + "logps/rejected": -985.6024169921875, + "loss": 0.525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4975996017456055, + "rewards/margins": 1.0218359231948853, + "rewards/rejected": -4.519434928894043, + "step": 1345 + }, + { + "epoch": 0.8790921707894522, + "grad_norm": 16.34267655213826, + "learning_rate": 6.581875807242643e-09, + "logits/chosen": -1.4592047929763794, + "logits/rejected": -1.4397943019866943, + "logps/chosen": -822.5889892578125, + "logps/rejected": -898.7081909179688, + "loss": 0.4558, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.987348794937134, + "rewards/margins": 0.8248506784439087, + "rewards/rejected": -3.812199115753174, + "step": 1346 + }, + { + "epoch": 0.8797452853294147, + "grad_norm": 16.0782479464605, + "learning_rate": 6.51195800216601e-09, + "logits/chosen": -1.5585129261016846, + "logits/rejected": -1.5897541046142578, + "logps/chosen": -826.855224609375, + "logps/rejected": -982.5360717773438, + "loss": 0.4817, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.9745357036590576, + "rewards/margins": 0.9419158697128296, + "rewards/rejected": -3.9164514541625977, + "step": 1347 + }, + { + "epoch": 0.8803983998693771, + "grad_norm": 15.944369178445228, + "learning_rate": 6.442396686330104e-09, + "logits/chosen": -1.5734515190124512, + "logits/rejected": -1.5492303371429443, + "logps/chosen": -877.200927734375, + "logps/rejected": -935.7286987304688, + "loss": 0.4639, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.669853925704956, + "rewards/margins": 0.9167975187301636, + "rewards/rejected": -4.58665132522583, + "step": 1348 + }, + { + "epoch": 0.8810515144093395, + "grad_norm": 12.702778495003988, + "learning_rate": 6.373192221810694e-09, + "logits/chosen": -1.5214407444000244, + "logits/rejected": -1.5045905113220215, + "logps/chosen": -798.3970947265625, + "logps/rejected": -847.7827758789062, + "loss": 0.4624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3049206733703613, + "rewards/margins": 0.854720950126648, + "rewards/rejected": -4.159641742706299, + "step": 1349 + }, + { + "epoch": 0.881704628949302, + "grad_norm": 100.6971315662948, + "learning_rate": 6.304344968826094e-09, + "logits/chosen": -1.4444749355316162, + "logits/rejected": -1.461186170578003, + "logps/chosen": -823.1656494140625, + "logps/rejected": -854.8773193359375, + "loss": 0.4764, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2248125076293945, + "rewards/margins": 0.8384161591529846, + "rewards/rejected": -4.063228607177734, + "step": 1350 + }, + { + "epoch": 0.8823577434892644, + "grad_norm": 28.966983505349088, + "learning_rate": 6.235855285735289e-09, + "logits/chosen": -1.373712182044983, + "logits/rejected": -1.3527984619140625, + "logps/chosen": -833.9160766601562, + "logps/rejected": -975.32080078125, + "loss": 0.4452, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4645469188690186, + "rewards/margins": 1.1589038372039795, + "rewards/rejected": -4.623451232910156, + "step": 1351 + }, + { + "epoch": 0.8830108580292269, + "grad_norm": 11.232677637593673, + "learning_rate": 6.167723529036051e-09, + "logits/chosen": -1.4311108589172363, + "logits/rejected": -1.4204224348068237, + "logps/chosen": -745.5303955078125, + "logps/rejected": -826.6415405273438, + "loss": 0.556, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9358832836151123, + "rewards/margins": 0.5993090271949768, + "rewards/rejected": -3.5351924896240234, + "step": 1352 + }, + { + "epoch": 0.8836639725691893, + "grad_norm": 52.83410951930048, + "learning_rate": 6.099950053363109e-09, + "logits/chosen": -1.5700377225875854, + "logits/rejected": -1.5207599401474, + "logps/chosen": -897.0535888671875, + "logps/rejected": -950.018798828125, + "loss": 0.4828, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4924488067626953, + "rewards/margins": 0.8256914019584656, + "rewards/rejected": -4.318140506744385, + "step": 1353 + }, + { + "epoch": 0.8843170871091518, + "grad_norm": 48.439495319236386, + "learning_rate": 6.032535211486303e-09, + "logits/chosen": -1.5001033544540405, + "logits/rejected": -1.4997605085372925, + "logps/chosen": -765.0709838867188, + "logps/rejected": -830.47900390625, + "loss": 0.4423, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0950000286102295, + "rewards/margins": 0.8747947216033936, + "rewards/rejected": -3.969794273376465, + "step": 1354 + }, + { + "epoch": 0.8849702016491142, + "grad_norm": 56.98461717362285, + "learning_rate": 5.965479354308739e-09, + "logits/chosen": -1.4557162523269653, + "logits/rejected": -1.4349863529205322, + "logps/chosen": -910.533935546875, + "logps/rejected": -992.22998046875, + "loss": 0.5308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4078073501586914, + "rewards/margins": 0.8652389049530029, + "rewards/rejected": -4.273046016693115, + "step": 1355 + }, + { + "epoch": 0.8856233161890766, + "grad_norm": 24.54186584954266, + "learning_rate": 5.898782830864909e-09, + "logits/chosen": -1.51798415184021, + "logits/rejected": -1.564520001411438, + "logps/chosen": -840.780029296875, + "logps/rejected": -975.0224609375, + "loss": 0.4677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.289569139480591, + "rewards/margins": 1.0400049686431885, + "rewards/rejected": -4.329574108123779, + "step": 1356 + }, + { + "epoch": 0.886276430729039, + "grad_norm": 104.63221608787276, + "learning_rate": 5.832445988318996e-09, + "logits/chosen": -1.552783489227295, + "logits/rejected": -1.462197184562683, + "logps/chosen": -885.2568969726562, + "logps/rejected": -968.8541259765625, + "loss": 0.5129, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7370312213897705, + "rewards/margins": 1.4136296510696411, + "rewards/rejected": -5.150660514831543, + "step": 1357 + }, + { + "epoch": 0.8869295452690016, + "grad_norm": 45.185665578640275, + "learning_rate": 5.766469171962943e-09, + "logits/chosen": -1.5153913497924805, + "logits/rejected": -1.4714546203613281, + "logps/chosen": -927.0983276367188, + "logps/rejected": -1037.369140625, + "loss": 0.4428, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4304184913635254, + "rewards/margins": 0.805862307548523, + "rewards/rejected": -4.23628044128418, + "step": 1358 + }, + { + "epoch": 0.887582659808964, + "grad_norm": 79.59548680251643, + "learning_rate": 5.7008527252147525e-09, + "logits/chosen": -1.5614638328552246, + "logits/rejected": -1.5693732500076294, + "logps/chosen": -825.9854736328125, + "logps/rejected": -924.3582763671875, + "loss": 0.4688, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.293821096420288, + "rewards/margins": 1.0692007541656494, + "rewards/rejected": -4.363021373748779, + "step": 1359 + }, + { + "epoch": 0.8882357743489264, + "grad_norm": 17.890553860709666, + "learning_rate": 5.635596989616628e-09, + "logits/chosen": -1.3857258558273315, + "logits/rejected": -1.3223706483840942, + "logps/chosen": -842.4962768554688, + "logps/rejected": -924.7880859375, + "loss": 0.5206, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.7687184810638428, + "rewards/margins": 0.9322965145111084, + "rewards/rejected": -4.701014518737793, + "step": 1360 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 86.3211811567686, + "learning_rate": 5.570702304833225e-09, + "logits/chosen": -1.4879322052001953, + "logits/rejected": -1.5010827779769897, + "logps/chosen": -828.9422607421875, + "logps/rejected": -911.2158203125, + "loss": 0.482, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.6344053745269775, + "rewards/margins": 0.5890753865242004, + "rewards/rejected": -4.223480701446533, + "step": 1361 + }, + { + "epoch": 0.8895420034288514, + "grad_norm": 44.45040636623173, + "learning_rate": 5.5061690086498995e-09, + "logits/chosen": -1.60628342628479, + "logits/rejected": -1.5504629611968994, + "logps/chosen": -918.0870971679688, + "logps/rejected": -938.4238891601562, + "loss": 0.5785, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.70293927192688, + "rewards/margins": 0.504538357257843, + "rewards/rejected": -4.207477569580078, + "step": 1362 + }, + { + "epoch": 0.8901951179688138, + "grad_norm": 59.14938077433194, + "learning_rate": 5.441997436970908e-09, + "logits/chosen": -1.5273542404174805, + "logits/rejected": -1.4696133136749268, + "logps/chosen": -905.05615234375, + "logps/rejected": -1024.7708740234375, + "loss": 0.5164, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6890876293182373, + "rewards/margins": 0.9259529709815979, + "rewards/rejected": -4.6150407791137695, + "step": 1363 + }, + { + "epoch": 0.8908482325087762, + "grad_norm": 20.847094913779035, + "learning_rate": 5.3781879238177175e-09, + "logits/chosen": -1.4799872636795044, + "logits/rejected": -1.4763946533203125, + "logps/chosen": -818.2520751953125, + "logps/rejected": -900.87353515625, + "loss": 0.5318, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.162236213684082, + "rewards/margins": 0.6138532757759094, + "rewards/rejected": -3.776089668273926, + "step": 1364 + }, + { + "epoch": 0.8915013470487386, + "grad_norm": 42.086051676513534, + "learning_rate": 5.314740801327189e-09, + "logits/chosen": -1.5711402893066406, + "logits/rejected": -1.5064854621887207, + "logps/chosen": -800.728515625, + "logps/rejected": -949.0936889648438, + "loss": 0.4662, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.1598963737487793, + "rewards/margins": 1.2613778114318848, + "rewards/rejected": -4.421274662017822, + "step": 1365 + }, + { + "epoch": 0.8921544615887012, + "grad_norm": 36.43558502369589, + "learning_rate": 5.251656399749948e-09, + "logits/chosen": -1.5465190410614014, + "logits/rejected": -1.5165586471557617, + "logps/chosen": -931.0474853515625, + "logps/rejected": -1152.05126953125, + "loss": 0.4131, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.7468013763427734, + "rewards/margins": 1.6344844102859497, + "rewards/rejected": -5.381285667419434, + "step": 1366 + }, + { + "epoch": 0.8928075761286636, + "grad_norm": 18.023822032353266, + "learning_rate": 5.1889350474485425e-09, + "logits/chosen": -1.4559390544891357, + "logits/rejected": -1.4189265966415405, + "logps/chosen": -787.0001831054688, + "logps/rejected": -834.5303344726562, + "loss": 0.4982, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.818913698196411, + "rewards/margins": 0.7834856510162354, + "rewards/rejected": -3.6023998260498047, + "step": 1367 + }, + { + "epoch": 0.893460690668626, + "grad_norm": 51.00256674463102, + "learning_rate": 5.126577070895851e-09, + "logits/chosen": -1.4489341974258423, + "logits/rejected": -1.4130151271820068, + "logps/chosen": -933.874267578125, + "logps/rejected": -1004.5009765625, + "loss": 0.5661, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.800851345062256, + "rewards/margins": 1.1616989374160767, + "rewards/rejected": -4.962550163269043, + "step": 1368 + }, + { + "epoch": 0.8941138052085884, + "grad_norm": 28.042878667934062, + "learning_rate": 5.064582794673322e-09, + "logits/chosen": -1.5686874389648438, + "logits/rejected": -1.5284960269927979, + "logps/chosen": -913.930908203125, + "logps/rejected": -1030.9652099609375, + "loss": 0.5008, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.0387115478515625, + "rewards/margins": 0.8935470581054688, + "rewards/rejected": -4.9322590827941895, + "step": 1369 + }, + { + "epoch": 0.8947669197485509, + "grad_norm": 10.695407778573784, + "learning_rate": 5.002952541469296e-09, + "logits/chosen": -1.4896258115768433, + "logits/rejected": -1.4801172018051147, + "logps/chosen": -879.490966796875, + "logps/rejected": -959.708251953125, + "loss": 0.4915, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.568885564804077, + "rewards/margins": 0.942472517490387, + "rewards/rejected": -4.51135778427124, + "step": 1370 + }, + { + "epoch": 0.8954200342885134, + "grad_norm": 31.529008960795142, + "learning_rate": 4.941686632077316e-09, + "logits/chosen": -1.4891539812088013, + "logits/rejected": -1.4347100257873535, + "logps/chosen": -856.7740478515625, + "logps/rejected": -881.070068359375, + "loss": 0.4438, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.407636880874634, + "rewards/margins": 0.6649507284164429, + "rewards/rejected": -4.072587490081787, + "step": 1371 + }, + { + "epoch": 0.8960731488284758, + "grad_norm": 63.6933646160359, + "learning_rate": 4.880785385394481e-09, + "logits/chosen": -1.5834522247314453, + "logits/rejected": -1.5488102436065674, + "logps/chosen": -841.972900390625, + "logps/rejected": -880.1702880859375, + "loss": 0.457, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.852036714553833, + "rewards/margins": 1.1045963764190674, + "rewards/rejected": -3.956632614135742, + "step": 1372 + }, + { + "epoch": 0.8967262633684382, + "grad_norm": 69.78064834856058, + "learning_rate": 4.820249118419753e-09, + "logits/chosen": -1.4307173490524292, + "logits/rejected": -1.4592258930206299, + "logps/chosen": -751.7630615234375, + "logps/rejected": -990.3554077148438, + "loss": 0.4792, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6149396896362305, + "rewards/margins": 1.3603920936584473, + "rewards/rejected": -3.9753315448760986, + "step": 1373 + }, + { + "epoch": 0.8973793779084007, + "grad_norm": 10.250335849217539, + "learning_rate": 4.760078146252369e-09, + "logits/chosen": -1.5606201887130737, + "logits/rejected": -1.5576801300048828, + "logps/chosen": -913.7922973632812, + "logps/rejected": -964.5801391601562, + "loss": 0.4753, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.296699523925781, + "rewards/margins": 0.7379961013793945, + "rewards/rejected": -5.034695625305176, + "step": 1374 + }, + { + "epoch": 0.8980324924483631, + "grad_norm": 20.39345965692167, + "learning_rate": 4.7002727820901145e-09, + "logits/chosen": -1.45150887966156, + "logits/rejected": -1.4612077474594116, + "logps/chosen": -808.03466796875, + "logps/rejected": -928.4208984375, + "loss": 0.4667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.283160448074341, + "rewards/margins": 1.0168074369430542, + "rewards/rejected": -4.2999677658081055, + "step": 1375 + }, + { + "epoch": 0.8986856069883256, + "grad_norm": 10.800412600192082, + "learning_rate": 4.640833337227754e-09, + "logits/chosen": -1.5039968490600586, + "logits/rejected": -1.4786089658737183, + "logps/chosen": -886.8804931640625, + "logps/rejected": -959.9457397460938, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3790817260742188, + "rewards/margins": 1.2774152755737305, + "rewards/rejected": -4.656497001647949, + "step": 1376 + }, + { + "epoch": 0.899338721528288, + "grad_norm": 103.40843027693559, + "learning_rate": 4.581760121055392e-09, + "logits/chosen": -1.5100741386413574, + "logits/rejected": -1.435150146484375, + "logps/chosen": -883.2802124023438, + "logps/rejected": -883.0489501953125, + "loss": 0.54, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2741122245788574, + "rewards/margins": 0.6780447959899902, + "rewards/rejected": -3.9521570205688477, + "step": 1377 + }, + { + "epoch": 0.8999918360682505, + "grad_norm": 48.575124614734456, + "learning_rate": 4.523053441056876e-09, + "logits/chosen": -1.5189893245697021, + "logits/rejected": -1.5679491758346558, + "logps/chosen": -803.8499755859375, + "logps/rejected": -894.542236328125, + "loss": 0.4704, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.50295352935791, + "rewards/margins": 0.8048915266990662, + "rewards/rejected": -4.307845592498779, + "step": 1378 + }, + { + "epoch": 0.9006449506082129, + "grad_norm": 10.423098002326993, + "learning_rate": 4.4647136028081536e-09, + "logits/chosen": -1.5686522722244263, + "logits/rejected": -1.5366755723953247, + "logps/chosen": -841.1636352539062, + "logps/rejected": -963.812255859375, + "loss": 0.4655, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.380739688873291, + "rewards/margins": 1.0111637115478516, + "rewards/rejected": -4.391903400421143, + "step": 1379 + }, + { + "epoch": 0.9012980651481753, + "grad_norm": 103.35704441058242, + "learning_rate": 4.4067409099757505e-09, + "logits/chosen": -1.4515259265899658, + "logits/rejected": -1.421863317489624, + "logps/chosen": -816.9014282226562, + "logps/rejected": -830.080810546875, + "loss": 0.5235, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.6674067974090576, + "rewards/margins": 0.3336777687072754, + "rewards/rejected": -4.001084327697754, + "step": 1380 + }, + { + "epoch": 0.9019511796881378, + "grad_norm": 17.484965241800026, + "learning_rate": 4.349135664315137e-09, + "logits/chosen": -1.5792274475097656, + "logits/rejected": -1.5765178203582764, + "logps/chosen": -947.952392578125, + "logps/rejected": -1038.84326171875, + "loss": 0.4834, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7614831924438477, + "rewards/margins": 0.8391762971878052, + "rewards/rejected": -4.6006598472595215, + "step": 1381 + }, + { + "epoch": 0.9026042942281003, + "grad_norm": 29.173220260017832, + "learning_rate": 4.291898165669155e-09, + "logits/chosen": -1.5122828483581543, + "logits/rejected": -1.5348204374313354, + "logps/chosen": -920.7978515625, + "logps/rejected": -1017.3441162109375, + "loss": 0.446, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.638453960418701, + "rewards/margins": 0.9234171509742737, + "rewards/rejected": -4.56187105178833, + "step": 1382 + }, + { + "epoch": 0.9032574087680627, + "grad_norm": 20.850486826793972, + "learning_rate": 4.235028711966512e-09, + "logits/chosen": -1.5382293462753296, + "logits/rejected": -1.5173406600952148, + "logps/chosen": -957.9508056640625, + "logps/rejected": -1045.537353515625, + "loss": 0.5194, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8855299949645996, + "rewards/margins": 1.027876615524292, + "rewards/rejected": -4.9134063720703125, + "step": 1383 + }, + { + "epoch": 0.9039105233080251, + "grad_norm": 34.38670135334981, + "learning_rate": 4.178527599220164e-09, + "logits/chosen": -1.420579195022583, + "logits/rejected": -1.3815526962280273, + "logps/chosen": -813.62255859375, + "logps/rejected": -969.841552734375, + "loss": 0.5321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1666247844696045, + "rewards/margins": 1.1781651973724365, + "rewards/rejected": -4.344789981842041, + "step": 1384 + }, + { + "epoch": 0.9045636378479875, + "grad_norm": 68.615502539619, + "learning_rate": 4.122395121525807e-09, + "logits/chosen": -1.4724527597427368, + "logits/rejected": -1.4684042930603027, + "logps/chosen": -836.29638671875, + "logps/rejected": -971.5530395507812, + "loss": 0.5204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4262635707855225, + "rewards/margins": 0.8687800765037537, + "rewards/rejected": -4.2950439453125, + "step": 1385 + }, + { + "epoch": 0.9052167523879501, + "grad_norm": 50.58217070416385, + "learning_rate": 4.0666315710603585e-09, + "logits/chosen": -1.4584506750106812, + "logits/rejected": -1.438408613204956, + "logps/chosen": -815.32373046875, + "logps/rejected": -983.1445922851562, + "loss": 0.4398, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.262063503265381, + "rewards/margins": 1.1712214946746826, + "rewards/rejected": -4.433284759521484, + "step": 1386 + }, + { + "epoch": 0.9058698669279125, + "grad_norm": 37.43119604963389, + "learning_rate": 4.011237238080412e-09, + "logits/chosen": -1.47329843044281, + "logits/rejected": -1.5162948369979858, + "logps/chosen": -834.7694091796875, + "logps/rejected": -891.7135620117188, + "loss": 0.4209, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.565274238586426, + "rewards/margins": 0.6577186584472656, + "rewards/rejected": -4.222992897033691, + "step": 1387 + }, + { + "epoch": 0.9065229814678749, + "grad_norm": 20.4350578675936, + "learning_rate": 3.956212410920731e-09, + "logits/chosen": -1.5497958660125732, + "logits/rejected": -1.5459709167480469, + "logps/chosen": -957.7774047851562, + "logps/rejected": -1025.171142578125, + "loss": 0.4836, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.694035053253174, + "rewards/margins": 1.2108312845230103, + "rewards/rejected": -4.9048662185668945, + "step": 1388 + }, + { + "epoch": 0.9071760960078373, + "grad_norm": 124.44180200304133, + "learning_rate": 3.90155737599277e-09, + "logits/chosen": -1.532940149307251, + "logits/rejected": -1.521911859512329, + "logps/chosen": -958.362060546875, + "logps/rejected": -1038.1849365234375, + "loss": 0.4972, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.649043321609497, + "rewards/margins": 1.0646413564682007, + "rewards/rejected": -4.713685035705566, + "step": 1389 + }, + { + "epoch": 0.9078292105477999, + "grad_norm": 21.63423361819083, + "learning_rate": 3.847272417783129e-09, + "logits/chosen": -1.5962800979614258, + "logits/rejected": -1.5604323148727417, + "logps/chosen": -924.5335083007812, + "logps/rejected": -991.5548095703125, + "loss": 0.4995, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.826894760131836, + "rewards/margins": 0.9313570261001587, + "rewards/rejected": -4.758251190185547, + "step": 1390 + }, + { + "epoch": 0.9084823250877623, + "grad_norm": 33.36489378101839, + "learning_rate": 3.793357818852141e-09, + "logits/chosen": -1.5311145782470703, + "logits/rejected": -1.5007251501083374, + "logps/chosen": -857.7115478515625, + "logps/rejected": -936.7415161132812, + "loss": 0.5142, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.454068660736084, + "rewards/margins": 1.1231262683868408, + "rewards/rejected": -4.577195167541504, + "step": 1391 + }, + { + "epoch": 0.9091354396277247, + "grad_norm": 23.390479098008697, + "learning_rate": 3.739813859832383e-09, + "logits/chosen": -1.5484957695007324, + "logits/rejected": -1.5225454568862915, + "logps/chosen": -945.0482788085938, + "logps/rejected": -1007.5425415039062, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9478251934051514, + "rewards/margins": 0.5874274969100952, + "rewards/rejected": -4.535252571105957, + "step": 1392 + }, + { + "epoch": 0.9097885541676871, + "grad_norm": 27.975290638594313, + "learning_rate": 3.686640819427164e-09, + "logits/chosen": -1.4847687482833862, + "logits/rejected": -1.4688597917556763, + "logps/chosen": -885.1260986328125, + "logps/rejected": -946.1005859375, + "loss": 0.5318, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.731746196746826, + "rewards/margins": 0.6578457355499268, + "rewards/rejected": -4.389591693878174, + "step": 1393 + }, + { + "epoch": 0.9104416687076496, + "grad_norm": 11.396992947468927, + "learning_rate": 3.633838974409148e-09, + "logits/chosen": -1.4794695377349854, + "logits/rejected": -1.458986759185791, + "logps/chosen": -792.629150390625, + "logps/rejected": -829.4083251953125, + "loss": 0.5528, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2658491134643555, + "rewards/margins": 0.6557501554489136, + "rewards/rejected": -3.9215991497039795, + "step": 1394 + }, + { + "epoch": 0.9110947832476121, + "grad_norm": 51.767743124843925, + "learning_rate": 3.5814085996188516e-09, + "logits/chosen": -1.5730397701263428, + "logits/rejected": -1.5797572135925293, + "logps/chosen": -915.05859375, + "logps/rejected": -1105.7381591796875, + "loss": 0.4246, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.560077667236328, + "rewards/margins": 1.278112769126892, + "rewards/rejected": -4.83819055557251, + "step": 1395 + }, + { + "epoch": 0.9117478977875745, + "grad_norm": 45.396998730278426, + "learning_rate": 3.529349967963263e-09, + "logits/chosen": -1.4627869129180908, + "logits/rejected": -1.473419427871704, + "logps/chosen": -947.2510986328125, + "logps/rejected": -1030.341064453125, + "loss": 0.4942, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.280148506164551, + "rewards/margins": 0.8479899764060974, + "rewards/rejected": -4.128138065338135, + "step": 1396 + }, + { + "epoch": 0.9124010123275369, + "grad_norm": 15.614778720558112, + "learning_rate": 3.477663350414378e-09, + "logits/chosen": -1.5414352416992188, + "logits/rejected": -1.560785174369812, + "logps/chosen": -816.3881225585938, + "logps/rejected": -960.979736328125, + "loss": 0.4485, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.140190601348877, + "rewards/margins": 1.348671555519104, + "rewards/rejected": -4.488862037658691, + "step": 1397 + }, + { + "epoch": 0.9130541268674994, + "grad_norm": 73.82027329116843, + "learning_rate": 3.426349016007815e-09, + "logits/chosen": -1.4851999282836914, + "logits/rejected": -1.4966939687728882, + "logps/chosen": -769.3817749023438, + "logps/rejected": -939.4908447265625, + "loss": 0.4833, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9074413776397705, + "rewards/margins": 1.105202555656433, + "rewards/rejected": -4.012643337249756, + "step": 1398 + }, + { + "epoch": 0.9137072414074618, + "grad_norm": 42.06804571860955, + "learning_rate": 3.3754072318414346e-09, + "logits/chosen": -1.4935294389724731, + "logits/rejected": -1.4656096696853638, + "logps/chosen": -913.9616088867188, + "logps/rejected": -963.2996826171875, + "loss": 0.5134, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.082716464996338, + "rewards/margins": 1.030174970626831, + "rewards/rejected": -5.112891674041748, + "step": 1399 + }, + { + "epoch": 0.9143603559474243, + "grad_norm": 52.8878293286426, + "learning_rate": 3.3248382630738813e-09, + "logits/chosen": -1.5851317644119263, + "logits/rejected": -1.5376235246658325, + "logps/chosen": -955.0020141601562, + "logps/rejected": -968.5732421875, + "loss": 0.5165, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.052890777587891, + "rewards/margins": 0.612097442150116, + "rewards/rejected": -4.664988040924072, + "step": 1400 + }, + { + "epoch": 0.9143603559474243, + "eval_logits/chosen": -1.479893684387207, + "eval_logits/rejected": -1.4609832763671875, + "eval_logps/chosen": -843.42822265625, + "eval_logps/rejected": -927.4912719726562, + "eval_loss": 0.4942656457424164, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -3.330828905105591, + "eval_rewards/margins": 0.9216902256011963, + "eval_rewards/rejected": -4.252519130706787, + "eval_runtime": 300.2025, + "eval_samples_per_second": 13.324, + "eval_steps_per_second": 0.833, + "step": 1400 + }, + { + "epoch": 0.9150134704873867, + "grad_norm": 43.39561827227798, + "learning_rate": 3.2746423729232945e-09, + "logits/chosen": -1.4578529596328735, + "logits/rejected": -1.4696025848388672, + "logps/chosen": -826.8597412109375, + "logps/rejected": -900.2799682617188, + "loss": 0.4753, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.285310983657837, + "rewards/margins": 0.8875605463981628, + "rewards/rejected": -4.172871112823486, + "step": 1401 + }, + { + "epoch": 0.9156665850273492, + "grad_norm": 12.907015811194501, + "learning_rate": 3.224819822665842e-09, + "logits/chosen": -1.4285743236541748, + "logits/rejected": -1.4410088062286377, + "logps/chosen": -723.3681640625, + "logps/rejected": -917.68701171875, + "loss": 0.4513, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.843869209289551, + "rewards/margins": 1.6680275201797485, + "rewards/rejected": -4.511897087097168, + "step": 1402 + }, + { + "epoch": 0.9163196995673116, + "grad_norm": 14.776408723830784, + "learning_rate": 3.1753708716344364e-09, + "logits/chosen": -1.5420118570327759, + "logits/rejected": -1.4822347164154053, + "logps/chosen": -804.4373779296875, + "logps/rejected": -927.0358276367188, + "loss": 0.4767, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7062926292419434, + "rewards/margins": 0.9591672420501709, + "rewards/rejected": -4.665459632873535, + "step": 1403 + }, + { + "epoch": 0.916972814107274, + "grad_norm": 110.87083222991181, + "learning_rate": 3.1262957772173637e-09, + "logits/chosen": -1.4802658557891846, + "logits/rejected": -1.4908185005187988, + "logps/chosen": -890.1760864257812, + "logps/rejected": -956.2567138671875, + "loss": 0.4505, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3808512687683105, + "rewards/margins": 0.817065417766571, + "rewards/rejected": -4.1979169845581055, + "step": 1404 + }, + { + "epoch": 0.9176259286472365, + "grad_norm": 61.96498893829653, + "learning_rate": 3.0775947948569162e-09, + "logits/chosen": -1.4553033113479614, + "logits/rejected": -1.469690203666687, + "logps/chosen": -904.5153198242188, + "logps/rejected": -954.6373901367188, + "loss": 0.4687, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5189156532287598, + "rewards/margins": 0.6943820118904114, + "rewards/rejected": -4.2132978439331055, + "step": 1405 + }, + { + "epoch": 0.918279043187199, + "grad_norm": 67.39643533256593, + "learning_rate": 3.0292681780481027e-09, + "logits/chosen": -1.587602972984314, + "logits/rejected": -1.5586885213851929, + "logps/chosen": -860.71923828125, + "logps/rejected": -963.667724609375, + "loss": 0.4579, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3475418090820312, + "rewards/margins": 0.9892367124557495, + "rewards/rejected": -4.336778163909912, + "step": 1406 + }, + { + "epoch": 0.9189321577271614, + "grad_norm": 21.82848056849987, + "learning_rate": 2.981316178337298e-09, + "logits/chosen": -1.4134702682495117, + "logits/rejected": -1.4143006801605225, + "logps/chosen": -781.74853515625, + "logps/rejected": -905.278564453125, + "loss": 0.4413, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.0943939685821533, + "rewards/margins": 1.127016305923462, + "rewards/rejected": -4.221409797668457, + "step": 1407 + }, + { + "epoch": 0.9195852722671238, + "grad_norm": 50.76090513680746, + "learning_rate": 2.933739045320946e-09, + "logits/chosen": -1.5723644495010376, + "logits/rejected": -1.4760441780090332, + "logps/chosen": -810.4500122070312, + "logps/rejected": -890.5816040039062, + "loss": 0.4265, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.274219512939453, + "rewards/margins": 1.089596152305603, + "rewards/rejected": -4.363815784454346, + "step": 1408 + }, + { + "epoch": 0.9202383868070863, + "grad_norm": 35.65848530691593, + "learning_rate": 2.886537026644259e-09, + "logits/chosen": -1.4684292078018188, + "logits/rejected": -1.492079257965088, + "logps/chosen": -921.974853515625, + "logps/rejected": -1068.1021728515625, + "loss": 0.4578, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9635848999023438, + "rewards/margins": 1.2408589124679565, + "rewards/rejected": -5.20444393157959, + "step": 1409 + }, + { + "epoch": 0.9208915013470488, + "grad_norm": 13.02579519124362, + "learning_rate": 2.8397103679999535e-09, + "logits/chosen": -1.4359222650527954, + "logits/rejected": -1.4592533111572266, + "logps/chosen": -927.1942749023438, + "logps/rejected": -1130.98779296875, + "loss": 0.5307, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3110556602478027, + "rewards/margins": 1.212038278579712, + "rewards/rejected": -4.5230937004089355, + "step": 1410 + }, + { + "epoch": 0.9215446158870112, + "grad_norm": 71.1836961214141, + "learning_rate": 2.7932593131269085e-09, + "logits/chosen": -1.506251573562622, + "logits/rejected": -1.4403364658355713, + "logps/chosen": -857.0916748046875, + "logps/rejected": -886.8201293945312, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.356765031814575, + "rewards/margins": 0.6197909712791443, + "rewards/rejected": -3.9765563011169434, + "step": 1411 + }, + { + "epoch": 0.9221977304269736, + "grad_norm": 13.552404267546486, + "learning_rate": 2.747184103808975e-09, + "logits/chosen": -1.531203269958496, + "logits/rejected": -1.497283935546875, + "logps/chosen": -899.0496215820312, + "logps/rejected": -952.4412841796875, + "loss": 0.4867, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.493135452270508, + "rewards/margins": 0.9362788796424866, + "rewards/rejected": -4.429414749145508, + "step": 1412 + }, + { + "epoch": 0.922850844966936, + "grad_norm": 83.47511406736928, + "learning_rate": 2.7014849798736526e-09, + "logits/chosen": -1.4521548748016357, + "logits/rejected": -1.4682101011276245, + "logps/chosen": -852.2609252929688, + "logps/rejected": -949.7713012695312, + "loss": 0.4316, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3543503284454346, + "rewards/margins": 1.411728858947754, + "rewards/rejected": -4.766078948974609, + "step": 1413 + }, + { + "epoch": 0.9235039595068986, + "grad_norm": 20.341630193627967, + "learning_rate": 2.6561621791908654e-09, + "logits/chosen": -1.53240168094635, + "logits/rejected": -1.5243420600891113, + "logps/chosen": -909.6138916015625, + "logps/rejected": -951.72900390625, + "loss": 0.4342, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3367176055908203, + "rewards/margins": 0.8963368535041809, + "rewards/rejected": -4.233054161071777, + "step": 1414 + }, + { + "epoch": 0.924157074046861, + "grad_norm": 41.165407344672616, + "learning_rate": 2.6112159376717456e-09, + "logits/chosen": -1.4888275861740112, + "logits/rejected": -1.4489132165908813, + "logps/chosen": -770.2021484375, + "logps/rejected": -767.581787109375, + "loss": 0.5119, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1834659576416016, + "rewards/margins": 0.5162588357925415, + "rewards/rejected": -3.6997246742248535, + "step": 1415 + }, + { + "epoch": 0.9248101885868234, + "grad_norm": 49.5990277647984, + "learning_rate": 2.5666464892673768e-09, + "logits/chosen": -1.4363664388656616, + "logits/rejected": -1.4629590511322021, + "logps/chosen": -779.8847045898438, + "logps/rejected": -841.2040405273438, + "loss": 0.4964, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.621016502380371, + "rewards/margins": 0.6873450875282288, + "rewards/rejected": -3.308361530303955, + "step": 1416 + }, + { + "epoch": 0.9254633031267858, + "grad_norm": 80.94852955816721, + "learning_rate": 2.5224540659675692e-09, + "logits/chosen": -1.5635671615600586, + "logits/rejected": -1.5525977611541748, + "logps/chosen": -759.9940185546875, + "logps/rejected": -936.1673583984375, + "loss": 0.4763, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.061068058013916, + "rewards/margins": 1.1235350370407104, + "rewards/rejected": -4.184603214263916, + "step": 1417 + }, + { + "epoch": 0.9261164176667483, + "grad_norm": 42.64229099155968, + "learning_rate": 2.4786388977997034e-09, + "logits/chosen": -1.5770373344421387, + "logits/rejected": -1.4894176721572876, + "logps/chosen": -840.8875122070312, + "logps/rejected": -915.895263671875, + "loss": 0.5005, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.297794818878174, + "rewards/margins": 0.8870930671691895, + "rewards/rejected": -4.184887886047363, + "step": 1418 + }, + { + "epoch": 0.9267695322067108, + "grad_norm": 95.47316675894163, + "learning_rate": 2.435201212827456e-09, + "logits/chosen": -1.5091971158981323, + "logits/rejected": -1.4532575607299805, + "logps/chosen": -874.2222900390625, + "logps/rejected": -985.8672485351562, + "loss": 0.4799, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.3658037185668945, + "rewards/margins": 0.7014881372451782, + "rewards/rejected": -4.067291736602783, + "step": 1419 + }, + { + "epoch": 0.9274226467466732, + "grad_norm": 22.60308983441237, + "learning_rate": 2.3921412371496834e-09, + "logits/chosen": -1.481071949005127, + "logits/rejected": -1.403544545173645, + "logps/chosen": -835.0723266601562, + "logps/rejected": -833.1524047851562, + "loss": 0.5539, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.328864812850952, + "rewards/margins": 0.49991360306739807, + "rewards/rejected": -3.8287787437438965, + "step": 1420 + }, + { + "epoch": 0.9280757612866356, + "grad_norm": 22.71188518146946, + "learning_rate": 2.349459194899198e-09, + "logits/chosen": -1.4905717372894287, + "logits/rejected": -1.4012264013290405, + "logps/chosen": -864.1593017578125, + "logps/rejected": -974.7191162109375, + "loss": 0.4975, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.465050220489502, + "rewards/margins": 0.9912995100021362, + "rewards/rejected": -4.4563493728637695, + "step": 1421 + }, + { + "epoch": 0.9287288758265981, + "grad_norm": 38.38087407026408, + "learning_rate": 2.307155308241643e-09, + "logits/chosen": -1.51276695728302, + "logits/rejected": -1.5155949592590332, + "logps/chosen": -840.83056640625, + "logps/rejected": -936.4772338867188, + "loss": 0.4841, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4179623126983643, + "rewards/margins": 1.073073387145996, + "rewards/rejected": -4.4910359382629395, + "step": 1422 + }, + { + "epoch": 0.9293819903665606, + "grad_norm": 31.60230447821753, + "learning_rate": 2.2652297973742963e-09, + "logits/chosen": -1.5305612087249756, + "logits/rejected": -1.509539008140564, + "logps/chosen": -770.224609375, + "logps/rejected": -883.810302734375, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.264803409576416, + "rewards/margins": 1.1701805591583252, + "rewards/rejected": -4.434983730316162, + "step": 1423 + }, + { + "epoch": 0.930035104906523, + "grad_norm": 77.90811255353113, + "learning_rate": 2.2236828805249184e-09, + "logits/chosen": -1.5154668092727661, + "logits/rejected": -1.516597867012024, + "logps/chosen": -771.7640380859375, + "logps/rejected": -854.826904296875, + "loss": 0.5166, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9361448287963867, + "rewards/margins": 0.7485728859901428, + "rewards/rejected": -3.6847176551818848, + "step": 1424 + }, + { + "epoch": 0.9306882194464854, + "grad_norm": 37.785751655050305, + "learning_rate": 2.1825147739506805e-09, + "logits/chosen": -1.4908900260925293, + "logits/rejected": -1.5062459707260132, + "logps/chosen": -884.8120727539062, + "logps/rejected": -929.1033935546875, + "loss": 0.562, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.515050172805786, + "rewards/margins": 0.5184265375137329, + "rewards/rejected": -4.03347635269165, + "step": 1425 + }, + { + "epoch": 0.9313413339864479, + "grad_norm": 15.861485447329448, + "learning_rate": 2.141725691936963e-09, + "logits/chosen": -1.4910266399383545, + "logits/rejected": -1.4862860441207886, + "logps/chosen": -826.7039794921875, + "logps/rejected": -980.1998291015625, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.137228012084961, + "rewards/margins": 1.146528720855713, + "rewards/rejected": -4.283757209777832, + "step": 1426 + }, + { + "epoch": 0.9319944485264103, + "grad_norm": 28.488624408307338, + "learning_rate": 2.1013158467963004e-09, + "logits/chosen": -1.6420342922210693, + "logits/rejected": -1.6163609027862549, + "logps/chosen": -924.2805786132812, + "logps/rejected": -944.080810546875, + "loss": 0.5047, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.657701015472412, + "rewards/margins": 0.6087216138839722, + "rewards/rejected": -4.266422748565674, + "step": 1427 + }, + { + "epoch": 0.9326475630663728, + "grad_norm": 10.967689522386022, + "learning_rate": 2.0612854488672227e-09, + "logits/chosen": -1.4555834531784058, + "logits/rejected": -1.4464821815490723, + "logps/chosen": -761.0975952148438, + "logps/rejected": -887.48046875, + "loss": 0.448, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0129241943359375, + "rewards/margins": 1.1502059698104858, + "rewards/rejected": -4.163130760192871, + "step": 1428 + }, + { + "epoch": 0.9333006776063352, + "grad_norm": 14.555394411736843, + "learning_rate": 2.0216347065132144e-09, + "logits/chosen": -1.4503599405288696, + "logits/rejected": -1.418792724609375, + "logps/chosen": -837.8484497070312, + "logps/rejected": -937.6596069335938, + "loss": 0.5286, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.435154914855957, + "rewards/margins": 0.9483442902565002, + "rewards/rejected": -4.3834991455078125, + "step": 1429 + }, + { + "epoch": 0.9339537921462977, + "grad_norm": 65.87984391899451, + "learning_rate": 1.982363826121583e-09, + "logits/chosen": -1.5405863523483276, + "logits/rejected": -1.545666217803955, + "logps/chosen": -946.0952758789062, + "logps/rejected": -1030.0819091796875, + "loss": 0.477, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5466394424438477, + "rewards/margins": 0.8139285445213318, + "rewards/rejected": -4.360568046569824, + "step": 1430 + }, + { + "epoch": 0.9346069066862601, + "grad_norm": 18.205479221571412, + "learning_rate": 1.943473012102409e-09, + "logits/chosen": -1.531268835067749, + "logits/rejected": -1.5460495948791504, + "logps/chosen": -898.9242553710938, + "logps/rejected": -946.4170532226562, + "loss": 0.54, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.561772346496582, + "rewards/margins": 0.8415082693099976, + "rewards/rejected": -4.403279781341553, + "step": 1431 + }, + { + "epoch": 0.9352600212262225, + "grad_norm": 22.739713888659217, + "learning_rate": 1.9049624668874886e-09, + "logits/chosen": -1.4721064567565918, + "logits/rejected": -1.457444667816162, + "logps/chosen": -727.5782470703125, + "logps/rejected": -881.88720703125, + "loss": 0.432, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.979851007461548, + "rewards/margins": 1.0425056219100952, + "rewards/rejected": -4.0223565101623535, + "step": 1432 + }, + { + "epoch": 0.935913135766185, + "grad_norm": 23.34819005960656, + "learning_rate": 1.866832390929243e-09, + "logits/chosen": -1.4648447036743164, + "logits/rejected": -1.4732989072799683, + "logps/chosen": -815.2117309570312, + "logps/rejected": -898.7406005859375, + "loss": 0.541, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4008235931396484, + "rewards/margins": 0.6835126280784607, + "rewards/rejected": -4.084336280822754, + "step": 1433 + }, + { + "epoch": 0.9365662503061475, + "grad_norm": 63.46180789960193, + "learning_rate": 1.8290829826997367e-09, + "logits/chosen": -1.559678554534912, + "logits/rejected": -1.5437560081481934, + "logps/chosen": -902.6143188476562, + "logps/rejected": -933.3184204101562, + "loss": 0.5227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4241020679473877, + "rewards/margins": 0.513259768486023, + "rewards/rejected": -3.9373619556427, + "step": 1434 + }, + { + "epoch": 0.9372193648461099, + "grad_norm": 34.479724315346566, + "learning_rate": 1.7917144386895926e-09, + "logits/chosen": -1.5433356761932373, + "logits/rejected": -1.4930343627929688, + "logps/chosen": -839.4082641601562, + "logps/rejected": -884.831298828125, + "loss": 0.5361, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.436253547668457, + "rewards/margins": 0.7238208651542664, + "rewards/rejected": -4.160074234008789, + "step": 1435 + }, + { + "epoch": 0.9378724793860723, + "grad_norm": 15.898555518439984, + "learning_rate": 1.7547269534069626e-09, + "logits/chosen": -1.4363042116165161, + "logits/rejected": -1.426371455192566, + "logps/chosen": -834.6913452148438, + "logps/rejected": -999.5419921875, + "loss": 0.4245, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.229515552520752, + "rewards/margins": 1.1570442914962769, + "rewards/rejected": -4.386559963226318, + "step": 1436 + }, + { + "epoch": 0.9385255939260347, + "grad_norm": 24.923739035573195, + "learning_rate": 1.7181207193765756e-09, + "logits/chosen": -1.5305100679397583, + "logits/rejected": -1.5034910440444946, + "logps/chosen": -884.4599609375, + "logps/rejected": -919.95947265625, + "loss": 0.4775, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1505179405212402, + "rewards/margins": 0.9173092842102051, + "rewards/rejected": -4.067827224731445, + "step": 1437 + }, + { + "epoch": 0.9391787084659973, + "grad_norm": 22.943505162467492, + "learning_rate": 1.681895927138674e-09, + "logits/chosen": -1.532247543334961, + "logits/rejected": -1.5096689462661743, + "logps/chosen": -860.7628173828125, + "logps/rejected": -1032.169189453125, + "loss": 0.4488, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.4541678428649902, + "rewards/margins": 1.4435040950775146, + "rewards/rejected": -4.897672176361084, + "step": 1438 + }, + { + "epoch": 0.9398318230059597, + "grad_norm": 43.92276660865622, + "learning_rate": 1.646052765248046e-09, + "logits/chosen": -1.5007368326187134, + "logits/rejected": -1.539098858833313, + "logps/chosen": -893.9174194335938, + "logps/rejected": -938.9563598632812, + "loss": 0.5518, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.333798885345459, + "rewards/margins": 0.5584127902984619, + "rewards/rejected": -4.8922119140625, + "step": 1439 + }, + { + "epoch": 0.9404849375459221, + "grad_norm": 34.468585588337255, + "learning_rate": 1.6105914202730608e-09, + "logits/chosen": -1.4849194288253784, + "logits/rejected": -1.4576518535614014, + "logps/chosen": -796.8408813476562, + "logps/rejected": -966.2708129882812, + "loss": 0.4925, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.009169578552246, + "rewards/margins": 1.379059076309204, + "rewards/rejected": -4.388228893280029, + "step": 1440 + }, + { + "epoch": 0.9411380520858845, + "grad_norm": 40.61462323177837, + "learning_rate": 1.5755120767946607e-09, + "logits/chosen": -1.4398796558380127, + "logits/rejected": -1.3817028999328613, + "logps/chosen": -766.0177612304688, + "logps/rejected": -804.5883178710938, + "loss": 0.4724, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0304973125457764, + "rewards/margins": 0.6275922656059265, + "rewards/rejected": -3.6580896377563477, + "step": 1441 + }, + { + "epoch": 0.941791166625847, + "grad_norm": 10.887896926258218, + "learning_rate": 1.5408149174054446e-09, + "logits/chosen": -1.4392205476760864, + "logits/rejected": -1.4316151142120361, + "logps/chosen": -780.572509765625, + "logps/rejected": -929.8662109375, + "loss": 0.4883, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8742194175720215, + "rewards/margins": 1.0152424573898315, + "rewards/rejected": -3.8894619941711426, + "step": 1442 + }, + { + "epoch": 0.9424442811658095, + "grad_norm": 11.226257970745792, + "learning_rate": 1.506500122708662e-09, + "logits/chosen": -1.4671475887298584, + "logits/rejected": -1.4885404109954834, + "logps/chosen": -872.2683715820312, + "logps/rejected": -914.68115234375, + "loss": 0.5137, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3775594234466553, + "rewards/margins": 0.5714359283447266, + "rewards/rejected": -3.9489948749542236, + "step": 1443 + }, + { + "epoch": 0.9430973957057719, + "grad_norm": 75.39569486082955, + "learning_rate": 1.4725678713173207e-09, + "logits/chosen": -1.5088709592819214, + "logits/rejected": -1.502449870109558, + "logps/chosen": -910.4465942382812, + "logps/rejected": -961.0289306640625, + "loss": 0.527, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.666200637817383, + "rewards/margins": 0.5112284421920776, + "rewards/rejected": -4.177428722381592, + "step": 1444 + }, + { + "epoch": 0.9437505102457343, + "grad_norm": 12.36325740105416, + "learning_rate": 1.4390183398532457e-09, + "logits/chosen": -1.5747441053390503, + "logits/rejected": -1.5668046474456787, + "logps/chosen": -940.868408203125, + "logps/rejected": -1135.773681640625, + "loss": 0.4886, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.498914957046509, + "rewards/margins": 1.4520134925842285, + "rewards/rejected": -4.950928688049316, + "step": 1445 + }, + { + "epoch": 0.9444036247856968, + "grad_norm": 53.08556208724623, + "learning_rate": 1.405851702946148e-09, + "logits/chosen": -1.4660958051681519, + "logits/rejected": -1.4496102333068848, + "logps/chosen": -885.9743041992188, + "logps/rejected": -923.9781494140625, + "loss": 0.5484, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4384634494781494, + "rewards/margins": 0.7487789392471313, + "rewards/rejected": -4.18724250793457, + "step": 1446 + }, + { + "epoch": 0.9450567393256593, + "grad_norm": 10.230443690320527, + "learning_rate": 1.3730681332327242e-09, + "logits/chosen": -1.409921646118164, + "logits/rejected": -1.4098153114318848, + "logps/chosen": -854.5587158203125, + "logps/rejected": -979.5097045898438, + "loss": 0.4908, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.432528018951416, + "rewards/margins": 1.0839788913726807, + "rewards/rejected": -4.516506671905518, + "step": 1447 + }, + { + "epoch": 0.9457098538656217, + "grad_norm": 51.690528713979994, + "learning_rate": 1.3406678013557492e-09, + "logits/chosen": -1.4445513486862183, + "logits/rejected": -1.489593267440796, + "logps/chosen": -868.4315185546875, + "logps/rejected": -968.2102661132812, + "loss": 0.431, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0471696853637695, + "rewards/margins": 0.9745462536811829, + "rewards/rejected": -4.021716117858887, + "step": 1448 + }, + { + "epoch": 0.9463629684055841, + "grad_norm": 66.01296376492611, + "learning_rate": 1.3086508759631936e-09, + "logits/chosen": -1.609882116317749, + "logits/rejected": -1.605045199394226, + "logps/chosen": -855.3104248046875, + "logps/rejected": -889.3307495117188, + "loss": 0.543, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.2741992473602295, + "rewards/margins": 0.46837118268013, + "rewards/rejected": -3.742570161819458, + "step": 1449 + }, + { + "epoch": 0.9470160829455466, + "grad_norm": 69.83844230333594, + "learning_rate": 1.2770175237073661e-09, + "logits/chosen": -1.5078953504562378, + "logits/rejected": -1.4897831678390503, + "logps/chosen": -891.066650390625, + "logps/rejected": -890.9603271484375, + "loss": 0.5112, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.392768383026123, + "rewards/margins": 0.624529242515564, + "rewards/rejected": -4.017297267913818, + "step": 1450 + }, + { + "epoch": 0.947669197485509, + "grad_norm": 96.35598691320328, + "learning_rate": 1.2457679092440054e-09, + "logits/chosen": -1.5438551902770996, + "logits/rejected": -1.5170389413833618, + "logps/chosen": -747.1302490234375, + "logps/rejected": -902.2919311523438, + "loss": 0.4724, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.6579556465148926, + "rewards/margins": 1.488484501838684, + "rewards/rejected": -4.146440505981445, + "step": 1451 + }, + { + "epoch": 0.9483223120254715, + "grad_norm": 10.598167900496241, + "learning_rate": 1.2149021952314654e-09, + "logits/chosen": -1.587843418121338, + "logits/rejected": -1.5127121210098267, + "logps/chosen": -762.05615234375, + "logps/rejected": -796.598388671875, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.141162157058716, + "rewards/margins": 0.5582625865936279, + "rewards/rejected": -3.699424982070923, + "step": 1452 + }, + { + "epoch": 0.9489754265654339, + "grad_norm": 37.95658977591617, + "learning_rate": 1.1844205423298142e-09, + "logits/chosen": -1.5430374145507812, + "logits/rejected": -1.494556188583374, + "logps/chosen": -900.53173828125, + "logps/rejected": -893.8412475585938, + "loss": 0.4861, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4113199710845947, + "rewards/margins": 0.7247394323348999, + "rewards/rejected": -4.136059284210205, + "step": 1453 + }, + { + "epoch": 0.9496285411053964, + "grad_norm": 37.85534269742698, + "learning_rate": 1.15432310920007e-09, + "logits/chosen": -1.5332300662994385, + "logits/rejected": -1.5049644708633423, + "logps/chosen": -879.8773803710938, + "logps/rejected": -943.8563842773438, + "loss": 0.4722, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.507415771484375, + "rewards/margins": 0.8981890678405762, + "rewards/rejected": -4.405604839324951, + "step": 1454 + }, + { + "epoch": 0.9502816556453588, + "grad_norm": 13.249961940362768, + "learning_rate": 1.1246100525033165e-09, + "logits/chosen": -1.51246976852417, + "logits/rejected": -1.4882327318191528, + "logps/chosen": -900.8251342773438, + "logps/rejected": -959.567626953125, + "loss": 0.4925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6211113929748535, + "rewards/margins": 0.7765340805053711, + "rewards/rejected": -4.397645473480225, + "step": 1455 + }, + { + "epoch": 0.9509347701853212, + "grad_norm": 78.0554509757586, + "learning_rate": 1.0952815268999049e-09, + "logits/chosen": -1.4893440008163452, + "logits/rejected": -1.458596110343933, + "logps/chosen": -775.9324951171875, + "logps/rejected": -817.9571533203125, + "loss": 0.4805, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.140364646911621, + "rewards/margins": 0.7064178586006165, + "rewards/rejected": -3.846782684326172, + "step": 1456 + }, + { + "epoch": 0.9515878847252837, + "grad_norm": 112.04084469499804, + "learning_rate": 1.0663376850486628e-09, + "logits/chosen": -1.5067682266235352, + "logits/rejected": -1.438143253326416, + "logps/chosen": -828.5684814453125, + "logps/rejected": -927.2904052734375, + "loss": 0.5531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.495786666870117, + "rewards/margins": 0.701140820980072, + "rewards/rejected": -4.196927547454834, + "step": 1457 + }, + { + "epoch": 0.9522409992652462, + "grad_norm": 39.94282013087609, + "learning_rate": 1.0377786776060854e-09, + "logits/chosen": -1.5286598205566406, + "logits/rejected": -1.4984127283096313, + "logps/chosen": -771.9295654296875, + "logps/rejected": -820.18310546875, + "loss": 0.4538, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.990509033203125, + "rewards/margins": 1.0369949340820312, + "rewards/rejected": -4.027503967285156, + "step": 1458 + }, + { + "epoch": 0.9528941138052086, + "grad_norm": 93.88679149047655, + "learning_rate": 1.0096046532255376e-09, + "logits/chosen": -1.6264631748199463, + "logits/rejected": -1.5820178985595703, + "logps/chosen": -873.897705078125, + "logps/rejected": -970.2108154296875, + "loss": 0.5254, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5638842582702637, + "rewards/margins": 0.973100483417511, + "rewards/rejected": -4.536985397338867, + "step": 1459 + }, + { + "epoch": 0.953547228345171, + "grad_norm": 19.02700301018552, + "learning_rate": 9.818157585565284e-10, + "logits/chosen": -1.4352535009384155, + "logits/rejected": -1.3992193937301636, + "logps/chosen": -827.4281616210938, + "logps/rejected": -891.892822265625, + "loss": 0.49, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2847142219543457, + "rewards/margins": 0.696830153465271, + "rewards/rejected": -3.9815444946289062, + "step": 1460 + }, + { + "epoch": 0.9542003428851334, + "grad_norm": 12.308757033298024, + "learning_rate": 9.544121382438875e-10, + "logits/chosen": -1.5002676248550415, + "logits/rejected": -1.46719491481781, + "logps/chosen": -895.8308715820312, + "logps/rejected": -926.5982666015625, + "loss": 0.4543, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5023105144500732, + "rewards/margins": 0.7259926795959473, + "rewards/rejected": -4.2283034324646, + "step": 1461 + }, + { + "epoch": 0.954853457425096, + "grad_norm": 18.453922423435763, + "learning_rate": 9.273939349270565e-10, + "logits/chosen": -1.5010855197906494, + "logits/rejected": -1.3877067565917969, + "logps/chosen": -768.5657348632812, + "logps/rejected": -797.06396484375, + "loss": 0.5329, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9298336505889893, + "rewards/margins": 0.8176116943359375, + "rewards/rejected": -3.7474451065063477, + "step": 1462 + }, + { + "epoch": 0.9555065719650584, + "grad_norm": 51.301955382401765, + "learning_rate": 9.00761289239324e-10, + "logits/chosen": -1.5010892152786255, + "logits/rejected": -1.4880714416503906, + "logps/chosen": -826.286865234375, + "logps/rejected": -911.9449462890625, + "loss": 0.4457, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.053290843963623, + "rewards/margins": 0.8817964792251587, + "rewards/rejected": -3.9350876808166504, + "step": 1463 + }, + { + "epoch": 0.9561596865050208, + "grad_norm": 25.443543945201426, + "learning_rate": 8.74514339807117e-10, + "logits/chosen": -1.4686068296432495, + "logits/rejected": -1.4487457275390625, + "logps/chosen": -860.71484375, + "logps/rejected": -960.4424438476562, + "loss": 0.5242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8519248962402344, + "rewards/margins": 1.1036741733551025, + "rewards/rejected": -4.955598831176758, + "step": 1464 + }, + { + "epoch": 0.9568128010449832, + "grad_norm": 47.38366581435745, + "learning_rate": 8.48653223249235e-10, + "logits/chosen": -1.4724491834640503, + "logits/rejected": -1.4838087558746338, + "logps/chosen": -881.0520629882812, + "logps/rejected": -929.6651611328125, + "loss": 0.4467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.963942289352417, + "rewards/margins": 0.7111070156097412, + "rewards/rejected": -4.675049304962158, + "step": 1465 + }, + { + "epoch": 0.9574659155849458, + "grad_norm": 22.495822791256995, + "learning_rate": 8.23178074176184e-10, + "logits/chosen": -1.5290329456329346, + "logits/rejected": -1.5657262802124023, + "logps/chosen": -871.360107421875, + "logps/rejected": -918.6497802734375, + "loss": 0.4997, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4585037231445312, + "rewards/margins": 0.9077770113945007, + "rewards/rejected": -4.3662800788879395, + "step": 1466 + }, + { + "epoch": 0.9581190301249082, + "grad_norm": 56.19391082571053, + "learning_rate": 7.980890251894606e-10, + "logits/chosen": -1.5316890478134155, + "logits/rejected": -1.5544103384017944, + "logps/chosen": -880.1229858398438, + "logps/rejected": -1003.7587890625, + "loss": 0.4847, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.2340664863586426, + "rewards/margins": 1.333704948425293, + "rewards/rejected": -4.567770957946777, + "step": 1467 + }, + { + "epoch": 0.9587721446648706, + "grad_norm": 19.370534224099423, + "learning_rate": 7.733862068808522e-10, + "logits/chosen": -1.5931593179702759, + "logits/rejected": -1.595098614692688, + "logps/chosen": -917.0612182617188, + "logps/rejected": -948.163818359375, + "loss": 0.5525, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9437098503112793, + "rewards/margins": 0.40092504024505615, + "rewards/rejected": -4.344634532928467, + "step": 1468 + }, + { + "epoch": 0.959425259204833, + "grad_norm": 51.95592093037611, + "learning_rate": 7.490697478317709e-10, + "logits/chosen": -1.4602024555206299, + "logits/rejected": -1.472777247428894, + "logps/chosen": -886.6076049804688, + "logps/rejected": -977.1819458007812, + "loss": 0.4909, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.663419246673584, + "rewards/margins": 0.9881623983383179, + "rewards/rejected": -4.651581764221191, + "step": 1469 + }, + { + "epoch": 0.9600783737447955, + "grad_norm": 33.610490066674124, + "learning_rate": 7.251397746125709e-10, + "logits/chosen": -1.5568439960479736, + "logits/rejected": -1.548944354057312, + "logps/chosen": -813.5231323242188, + "logps/rejected": -983.3826904296875, + "loss": 0.5045, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4104156494140625, + "rewards/margins": 1.39304518699646, + "rewards/rejected": -4.803461074829102, + "step": 1470 + }, + { + "epoch": 0.960731488284758, + "grad_norm": 32.807228872430244, + "learning_rate": 7.01596411781899e-10, + "logits/chosen": -1.5096122026443481, + "logits/rejected": -1.489628791809082, + "logps/chosen": -898.908447265625, + "logps/rejected": -961.82958984375, + "loss": 0.4871, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5367379188537598, + "rewards/margins": 1.0929946899414062, + "rewards/rejected": -4.629732608795166, + "step": 1471 + }, + { + "epoch": 0.9613846028247204, + "grad_norm": 12.404429113630796, + "learning_rate": 6.784397818860532e-10, + "logits/chosen": -1.4223408699035645, + "logits/rejected": -1.4489169120788574, + "logps/chosen": -749.7882690429688, + "logps/rejected": -879.7198486328125, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0195603370666504, + "rewards/margins": 1.003127098083496, + "rewards/rejected": -4.022687911987305, + "step": 1472 + }, + { + "epoch": 0.9620377173646828, + "grad_norm": 25.259975174041397, + "learning_rate": 6.556700054583253e-10, + "logits/chosen": -1.4970064163208008, + "logits/rejected": -1.4991717338562012, + "logps/chosen": -767.5467529296875, + "logps/rejected": -903.8729248046875, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0890581607818604, + "rewards/margins": 0.8758949041366577, + "rewards/rejected": -3.9649529457092285, + "step": 1473 + }, + { + "epoch": 0.9626908319046453, + "grad_norm": 56.02069174460643, + "learning_rate": 6.332872010183843e-10, + "logits/chosen": -1.5272411108016968, + "logits/rejected": -1.53700590133667, + "logps/chosen": -874.013427734375, + "logps/rejected": -928.57373046875, + "loss": 0.4722, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1264383792877197, + "rewards/margins": 0.8865586519241333, + "rewards/rejected": -4.012997150421143, + "step": 1474 + }, + { + "epoch": 0.9633439464446077, + "grad_norm": 12.235784698068322, + "learning_rate": 6.112914850716771e-10, + "logits/chosen": -1.4167723655700684, + "logits/rejected": -1.4353446960449219, + "logps/chosen": -847.5712890625, + "logps/rejected": -927.8067626953125, + "loss": 0.4272, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4825477600097656, + "rewards/margins": 0.6743806004524231, + "rewards/rejected": -4.156928539276123, + "step": 1475 + }, + { + "epoch": 0.9639970609845702, + "grad_norm": 53.93753110085255, + "learning_rate": 5.896829721087709e-10, + "logits/chosen": -1.4274340867996216, + "logits/rejected": -1.419229507446289, + "logps/chosen": -805.4702758789062, + "logps/rejected": -839.0209350585938, + "loss": 0.4951, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2356109619140625, + "rewards/margins": 0.7077401280403137, + "rewards/rejected": -3.9433510303497314, + "step": 1476 + }, + { + "epoch": 0.9646501755245326, + "grad_norm": 14.406009332648742, + "learning_rate": 5.684617746048198e-10, + "logits/chosen": -1.5064961910247803, + "logits/rejected": -1.4822742938995361, + "logps/chosen": -826.1743774414062, + "logps/rejected": -864.1240234375, + "loss": 0.4387, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.075737476348877, + "rewards/margins": 0.7606046199798584, + "rewards/rejected": -3.8363420963287354, + "step": 1477 + }, + { + "epoch": 0.9653032900644951, + "grad_norm": 38.670095331691506, + "learning_rate": 5.476280030189406e-10, + "logits/chosen": -1.5506258010864258, + "logits/rejected": -1.5424368381500244, + "logps/chosen": -908.591064453125, + "logps/rejected": -1038.738525390625, + "loss": 0.5371, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.491032838821411, + "rewards/margins": 1.2838757038116455, + "rewards/rejected": -4.774909019470215, + "step": 1478 + }, + { + "epoch": 0.9659564046044575, + "grad_norm": 25.905815918474016, + "learning_rate": 5.271817657936467e-10, + "logits/chosen": -1.4018712043762207, + "logits/rejected": -1.3280322551727295, + "logps/chosen": -733.0325317382812, + "logps/rejected": -867.226806640625, + "loss": 0.4825, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2157974243164062, + "rewards/margins": 0.9778196811676025, + "rewards/rejected": -4.19361686706543, + "step": 1479 + }, + { + "epoch": 0.96660951914442, + "grad_norm": 21.34054547875047, + "learning_rate": 5.071231693542732e-10, + "logits/chosen": -1.5019912719726562, + "logits/rejected": -1.4946105480194092, + "logps/chosen": -867.0049438476562, + "logps/rejected": -912.6876220703125, + "loss": 0.4603, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.469493865966797, + "rewards/margins": 0.9318346977233887, + "rewards/rejected": -4.4013285636901855, + "step": 1480 + }, + { + "epoch": 0.9672626336843824, + "grad_norm": 53.27235117213442, + "learning_rate": 4.874523181084611e-10, + "logits/chosen": -1.520746111869812, + "logits/rejected": -1.5099103450775146, + "logps/chosen": -950.77880859375, + "logps/rejected": -1004.3994750976562, + "loss": 0.4787, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.89591908454895, + "rewards/margins": 0.5920171141624451, + "rewards/rejected": -4.487936019897461, + "step": 1481 + }, + { + "epoch": 0.9679157482243449, + "grad_norm": 28.111966813993863, + "learning_rate": 4.681693144455656e-10, + "logits/chosen": -1.5613229274749756, + "logits/rejected": -1.4852428436279297, + "logps/chosen": -881.259765625, + "logps/rejected": -977.8818969726562, + "loss": 0.5095, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.6972122192382812, + "rewards/margins": 0.9360721111297607, + "rewards/rejected": -4.633284568786621, + "step": 1482 + }, + { + "epoch": 0.9685688627643073, + "grad_norm": 54.78566289216356, + "learning_rate": 4.4927425873614867e-10, + "logits/chosen": -1.5294121503829956, + "logits/rejected": -1.4791772365570068, + "logps/chosen": -838.6341552734375, + "logps/rejected": -868.4017333984375, + "loss": 0.4892, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.21793794631958, + "rewards/margins": 0.8229467272758484, + "rewards/rejected": -4.040884971618652, + "step": 1483 + }, + { + "epoch": 0.9692219773042697, + "grad_norm": 18.897385067422782, + "learning_rate": 4.30767249331454e-10, + "logits/chosen": -1.5810683965682983, + "logits/rejected": -1.5569019317626953, + "logps/chosen": -889.7955322265625, + "logps/rejected": -911.5818481445312, + "loss": 0.504, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5015814304351807, + "rewards/margins": 0.5960092544555664, + "rewards/rejected": -4.097590923309326, + "step": 1484 + }, + { + "epoch": 0.9698750918442322, + "grad_norm": 11.548963014295419, + "learning_rate": 4.1264838256289126e-10, + "logits/chosen": -1.5425231456756592, + "logits/rejected": -1.5341880321502686, + "logps/chosen": -866.2848510742188, + "logps/rejected": -1056.494384765625, + "loss": 0.4792, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4112586975097656, + "rewards/margins": 1.2582924365997314, + "rewards/rejected": -4.669550895690918, + "step": 1485 + }, + { + "epoch": 0.9705282063841947, + "grad_norm": 45.7932753999572, + "learning_rate": 3.9491775274153595e-10, + "logits/chosen": -1.4852381944656372, + "logits/rejected": -1.4904857873916626, + "logps/chosen": -817.4542846679688, + "logps/rejected": -949.4779663085938, + "loss": 0.466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.714642286300659, + "rewards/margins": 0.9591977000236511, + "rewards/rejected": -4.673839569091797, + "step": 1486 + }, + { + "epoch": 0.9711813209241571, + "grad_norm": 21.73552603940438, + "learning_rate": 3.7757545215764686e-10, + "logits/chosen": -1.483793020248413, + "logits/rejected": -1.5184385776519775, + "logps/chosen": -764.6434936523438, + "logps/rejected": -1080.6500244140625, + "loss": 0.487, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.000844717025757, + "rewards/margins": 2.042339563369751, + "rewards/rejected": -5.043184280395508, + "step": 1487 + }, + { + "epoch": 0.9718344354641195, + "grad_norm": 30.8551493809766, + "learning_rate": 3.606215710801663e-10, + "logits/chosen": -1.5189553499221802, + "logits/rejected": -1.555790662765503, + "logps/chosen": -820.7545166015625, + "logps/rejected": -871.0463256835938, + "loss": 0.5234, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4103848934173584, + "rewards/margins": 0.8298892974853516, + "rewards/rejected": -4.240273475646973, + "step": 1488 + }, + { + "epoch": 0.9724875500040819, + "grad_norm": 47.632353902452365, + "learning_rate": 3.440561977562789e-10, + "logits/chosen": -1.4855304956436157, + "logits/rejected": -1.4552634954452515, + "logps/chosen": -765.5675048828125, + "logps/rejected": -875.1714477539062, + "loss": 0.4912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0855441093444824, + "rewards/margins": 0.9937740564346313, + "rewards/rejected": -4.079318046569824, + "step": 1489 + }, + { + "epoch": 0.9731406645440445, + "grad_norm": 59.60831943859033, + "learning_rate": 3.278794184109118e-10, + "logits/chosen": -1.5224008560180664, + "logits/rejected": -1.5374361276626587, + "logps/chosen": -921.4551391601562, + "logps/rejected": -966.5579223632812, + "loss": 0.4683, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.87973690032959, + "rewards/margins": 0.5896239280700684, + "rewards/rejected": -4.469360828399658, + "step": 1490 + }, + { + "epoch": 0.9737937790840069, + "grad_norm": 40.26002383284324, + "learning_rate": 3.1209131724633517e-10, + "logits/chosen": -1.5184483528137207, + "logits/rejected": -1.5036489963531494, + "logps/chosen": -814.7634887695312, + "logps/rejected": -889.21923828125, + "loss": 0.5193, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4046013355255127, + "rewards/margins": 0.740337610244751, + "rewards/rejected": -4.144938945770264, + "step": 1491 + }, + { + "epoch": 0.9744468936239693, + "grad_norm": 34.218594481949225, + "learning_rate": 2.9669197644168755e-10, + "logits/chosen": -1.5170872211456299, + "logits/rejected": -1.5203460454940796, + "logps/chosen": -835.5225219726562, + "logps/rejected": -900.4053955078125, + "loss": 0.4703, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.076322078704834, + "rewards/margins": 0.8840709328651428, + "rewards/rejected": -3.9603934288024902, + "step": 1492 + }, + { + "epoch": 0.9751000081639317, + "grad_norm": 10.70959520154608, + "learning_rate": 2.8168147615254265e-10, + "logits/chosen": -1.4827511310577393, + "logits/rejected": -1.463191270828247, + "logps/chosen": -825.8433837890625, + "logps/rejected": -895.0802612304688, + "loss": 0.433, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2821357250213623, + "rewards/margins": 0.6120182871818542, + "rewards/rejected": -3.8941545486450195, + "step": 1493 + }, + { + "epoch": 0.9757531227038942, + "grad_norm": 14.011776786991527, + "learning_rate": 2.6705989451054343e-10, + "logits/chosen": -1.3761827945709229, + "logits/rejected": -1.3667436838150024, + "logps/chosen": -856.0826416015625, + "logps/rejected": -964.3638305664062, + "loss": 0.527, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3973817825317383, + "rewards/margins": 0.8620023727416992, + "rewards/rejected": -4.2593841552734375, + "step": 1494 + }, + { + "epoch": 0.9764062372438567, + "grad_norm": 56.388774885579274, + "learning_rate": 2.528273076229187e-10, + "logits/chosen": -1.4972938299179077, + "logits/rejected": -1.4492802619934082, + "logps/chosen": -858.5281982421875, + "logps/rejected": -913.3388671875, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4028518199920654, + "rewards/margins": 0.6677621006965637, + "rewards/rejected": -4.070613861083984, + "step": 1495 + }, + { + "epoch": 0.9770593517838191, + "grad_norm": 14.034593283767544, + "learning_rate": 2.389837895721586e-10, + "logits/chosen": -1.5359089374542236, + "logits/rejected": -1.5055701732635498, + "logps/chosen": -792.88037109375, + "logps/rejected": -861.2803955078125, + "loss": 0.4586, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.479644775390625, + "rewards/margins": 0.4910427927970886, + "rewards/rejected": -3.9706876277923584, + "step": 1496 + }, + { + "epoch": 0.9777124663237815, + "grad_norm": 57.03647156257448, + "learning_rate": 2.255294124155982e-10, + "logits/chosen": -1.5697842836380005, + "logits/rejected": -1.507396936416626, + "logps/chosen": -896.1490478515625, + "logps/rejected": -974.0924682617188, + "loss": 0.5092, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4873480796813965, + "rewards/margins": 1.329317331314087, + "rewards/rejected": -4.8166656494140625, + "step": 1497 + }, + { + "epoch": 0.978365580863744, + "grad_norm": 46.79910545953451, + "learning_rate": 2.124642461850179e-10, + "logits/chosen": -1.5201869010925293, + "logits/rejected": -1.4976688623428345, + "logps/chosen": -853.4832153320312, + "logps/rejected": -928.146728515625, + "loss": 0.4592, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6100432872772217, + "rewards/margins": 0.6564720273017883, + "rewards/rejected": -4.266514778137207, + "step": 1498 + }, + { + "epoch": 0.9790186954037065, + "grad_norm": 23.03773550817736, + "learning_rate": 1.997883588863436e-10, + "logits/chosen": -1.5425734519958496, + "logits/rejected": -1.5184190273284912, + "logps/chosen": -913.212890625, + "logps/rejected": -1053.3978271484375, + "loss": 0.4411, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4705982208251953, + "rewards/margins": 0.9973933696746826, + "rewards/rejected": -4.467991352081299, + "step": 1499 + }, + { + "epoch": 0.9796718099436689, + "grad_norm": 81.36896110575177, + "learning_rate": 1.875018164992137e-10, + "logits/chosen": -1.534759283065796, + "logits/rejected": -1.4785943031311035, + "logps/chosen": -828.40234375, + "logps/rejected": -832.9805908203125, + "loss": 0.5192, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.459455728530884, + "rewards/margins": 0.5882536172866821, + "rewards/rejected": -4.0477094650268555, + "step": 1500 + }, + { + "epoch": 0.9796718099436689, + "eval_logits/chosen": -1.4782989025115967, + "eval_logits/rejected": -1.4591182470321655, + "eval_logps/chosen": -844.1143798828125, + "eval_logps/rejected": -928.2655029296875, + "eval_loss": 0.49420246481895447, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -3.33768892288208, + "eval_rewards/margins": 0.9225709438323975, + "eval_rewards/rejected": -4.260260105133057, + "eval_runtime": 296.4379, + "eval_samples_per_second": 13.494, + "eval_steps_per_second": 0.843, + "step": 1500 + }, + { + "epoch": 0.9803249244836313, + "grad_norm": 26.21653471442728, + "learning_rate": 1.7560468297669606e-10, + "logits/chosen": -1.5513707399368286, + "logits/rejected": -1.5569634437561035, + "logps/chosen": -814.3359375, + "logps/rejected": -854.103759765625, + "loss": 0.4669, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.062058925628662, + "rewards/margins": 0.5742703676223755, + "rewards/rejected": -3.6363296508789062, + "step": 1501 + }, + { + "epoch": 0.9809780390235938, + "grad_norm": 12.418015543507641, + "learning_rate": 1.640970202449382e-10, + "logits/chosen": -1.4305357933044434, + "logits/rejected": -1.4416850805282593, + "logps/chosen": -852.3341064453125, + "logps/rejected": -901.50537109375, + "loss": 0.4805, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.029092311859131, + "rewards/margins": 0.7821911573410034, + "rewards/rejected": -3.811283588409424, + "step": 1502 + }, + { + "epoch": 0.9816311535635562, + "grad_norm": 35.91459788005618, + "learning_rate": 1.52978888202826e-10, + "logits/chosen": -1.497631311416626, + "logits/rejected": -1.5434260368347168, + "logps/chosen": -787.7675170898438, + "logps/rejected": -954.6630249023438, + "loss": 0.4877, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.3874967098236084, + "rewards/margins": 1.1716786623001099, + "rewards/rejected": -4.55917501449585, + "step": 1503 + }, + { + "epoch": 0.9822842681035187, + "grad_norm": 15.545739037319393, + "learning_rate": 1.4225034472169217e-10, + "logits/chosen": -1.479112982749939, + "logits/rejected": -1.4773443937301636, + "logps/chosen": -777.0709228515625, + "logps/rejected": -973.9321899414062, + "loss": 0.4295, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.061960458755493, + "rewards/margins": 1.352473258972168, + "rewards/rejected": -4.414434432983398, + "step": 1504 + }, + { + "epoch": 0.9829373826434811, + "grad_norm": 10.049630463778348, + "learning_rate": 1.3191144564502488e-10, + "logits/chosen": -1.5108180046081543, + "logits/rejected": -1.474786400794983, + "logps/chosen": -910.9762573242188, + "logps/rejected": -990.8765869140625, + "loss": 0.5196, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5121772289276123, + "rewards/margins": 0.993119478225708, + "rewards/rejected": -4.5052971839904785, + "step": 1505 + }, + { + "epoch": 0.9835904971834436, + "grad_norm": 31.720266818268886, + "learning_rate": 1.2196224478814297e-10, + "logits/chosen": -1.4539670944213867, + "logits/rejected": -1.4328685998916626, + "logps/chosen": -865.9363403320312, + "logps/rejected": -964.106201171875, + "loss": 0.483, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5068650245666504, + "rewards/margins": 0.9375267028808594, + "rewards/rejected": -4.44439172744751, + "step": 1506 + }, + { + "epoch": 0.984243611723406, + "grad_norm": 45.556241819502084, + "learning_rate": 1.1240279393793795e-10, + "logits/chosen": -1.5094014406204224, + "logits/rejected": -1.4748339653015137, + "logps/chosen": -926.0755004882812, + "logps/rejected": -976.0172729492188, + "loss": 0.4813, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7970991134643555, + "rewards/margins": 0.9570468068122864, + "rewards/rejected": -4.754145622253418, + "step": 1507 + }, + { + "epoch": 0.9848967262633684, + "grad_norm": 40.10867343247857, + "learning_rate": 1.0323314285260731e-10, + "logits/chosen": -1.54648756980896, + "logits/rejected": -1.495444893836975, + "logps/chosen": -783.347900390625, + "logps/rejected": -871.718505859375, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.104883909225464, + "rewards/margins": 1.1070107221603394, + "rewards/rejected": -4.211894512176514, + "step": 1508 + }, + { + "epoch": 0.9855498408033309, + "grad_norm": 32.74695628540036, + "learning_rate": 9.445333926139665e-11, + "logits/chosen": -1.5436725616455078, + "logits/rejected": -1.5149645805358887, + "logps/chosen": -904.5325317382812, + "logps/rejected": -958.0336303710938, + "loss": 0.4634, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.427320957183838, + "rewards/margins": 0.8428292274475098, + "rewards/rejected": -4.270150661468506, + "step": 1509 + }, + { + "epoch": 0.9862029553432933, + "grad_norm": 50.87038917169162, + "learning_rate": 8.606342886432472e-11, + "logits/chosen": -1.5588243007659912, + "logits/rejected": -1.532243251800537, + "logps/chosen": -860.1405029296875, + "logps/rejected": -904.2451171875, + "loss": 0.4499, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0775833129882812, + "rewards/margins": 0.7431819438934326, + "rewards/rejected": -3.8207650184631348, + "step": 1510 + }, + { + "epoch": 0.9868560698832558, + "grad_norm": 29.345234968333465, + "learning_rate": 7.806345533197534e-11, + "logits/chosen": -1.5143674612045288, + "logits/rejected": -1.483849287033081, + "logps/chosen": -851.5826416015625, + "logps/rejected": -954.4253540039062, + "loss": 0.5093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5056495666503906, + "rewards/margins": 0.8994309902191162, + "rewards/rejected": -4.405080795288086, + "step": 1511 + }, + { + "epoch": 0.9875091844232182, + "grad_norm": 22.39474777978432, + "learning_rate": 7.045346030526423e-11, + "logits/chosen": -1.5250229835510254, + "logits/rejected": -1.4954551458358765, + "logps/chosen": -931.7249145507812, + "logps/rejected": -1065.670166015625, + "loss": 0.4391, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5478012561798096, + "rewards/margins": 1.2276500463485718, + "rewards/rejected": -4.77545166015625, + "step": 1512 + }, + { + "epoch": 0.9881622989631806, + "grad_norm": 68.548727751151, + "learning_rate": 6.323348339521418e-11, + "logits/chosen": -1.5496234893798828, + "logits/rejected": -1.5093997716903687, + "logps/chosen": -938.7208251953125, + "logps/rejected": -940.5839233398438, + "loss": 0.5334, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8004541397094727, + "rewards/margins": 0.6491421461105347, + "rewards/rejected": -4.449596405029297, + "step": 1513 + }, + { + "epoch": 0.9888154135031431, + "grad_norm": 15.003945051271025, + "learning_rate": 5.640356218274689e-11, + "logits/chosen": -1.4270832538604736, + "logits/rejected": -1.4271320104599, + "logps/chosen": -785.04052734375, + "logps/rejected": -928.6375732421875, + "loss": 0.5223, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0938949584960938, + "rewards/margins": 0.9844279885292053, + "rewards/rejected": -4.078322887420654, + "step": 1514 + }, + { + "epoch": 0.9894685280431056, + "grad_norm": 52.68067104199576, + "learning_rate": 4.996373221849981e-11, + "logits/chosen": -1.4737398624420166, + "logits/rejected": -1.478607177734375, + "logps/chosen": -859.705810546875, + "logps/rejected": -980.18701171875, + "loss": 0.4386, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2958946228027344, + "rewards/margins": 1.2142889499664307, + "rewards/rejected": -4.510182857513428, + "step": 1515 + }, + { + "epoch": 0.990121642583068, + "grad_norm": 41.105753672966415, + "learning_rate": 4.391402702263458e-11, + "logits/chosen": -1.5442856550216675, + "logits/rejected": -1.5355677604675293, + "logps/chosen": -766.7518920898438, + "logps/rejected": -807.2686157226562, + "loss": 0.4765, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.052950620651245, + "rewards/margins": 0.6595571041107178, + "rewards/rejected": -3.712507724761963, + "step": 1516 + }, + { + "epoch": 0.9907747571230304, + "grad_norm": 68.18799629848323, + "learning_rate": 3.82544780846622e-11, + "logits/chosen": -1.4942461252212524, + "logits/rejected": -1.4979506731033325, + "logps/chosen": -900.58935546875, + "logps/rejected": -1016.7969970703125, + "loss": 0.4941, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9103312492370605, + "rewards/margins": 0.9849377870559692, + "rewards/rejected": -4.89526891708374, + "step": 1517 + }, + { + "epoch": 0.9914278716629928, + "grad_norm": 20.36699675427797, + "learning_rate": 3.2985114863276484e-11, + "logits/chosen": -1.4821314811706543, + "logits/rejected": -1.4812616109848022, + "logps/chosen": -811.2876586914062, + "logps/rejected": -998.925048828125, + "loss": 0.5106, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.557675838470459, + "rewards/margins": 1.2705349922180176, + "rewards/rejected": -4.828211307525635, + "step": 1518 + }, + { + "epoch": 0.9920809862029554, + "grad_norm": 58.88999010926919, + "learning_rate": 2.810596478619587e-11, + "logits/chosen": -1.5028977394104004, + "logits/rejected": -1.4703741073608398, + "logps/chosen": -901.3570556640625, + "logps/rejected": -913.9461059570312, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.426830291748047, + "rewards/margins": 0.6236191391944885, + "rewards/rejected": -4.050449371337891, + "step": 1519 + }, + { + "epoch": 0.9927341007429178, + "grad_norm": 34.40888479605642, + "learning_rate": 2.3617053250046815e-11, + "logits/chosen": -1.5230963230133057, + "logits/rejected": -1.4947023391723633, + "logps/chosen": -770.20849609375, + "logps/rejected": -841.6630859375, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0242300033569336, + "rewards/margins": 0.8307550549507141, + "rewards/rejected": -3.854984760284424, + "step": 1520 + }, + { + "epoch": 0.9933872152828802, + "grad_norm": 22.3379376658494, + "learning_rate": 1.951840362018897e-11, + "logits/chosen": -1.5373893976211548, + "logits/rejected": -1.5419368743896484, + "logps/chosen": -797.622314453125, + "logps/rejected": -868.178955078125, + "loss": 0.4944, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.065598249435425, + "rewards/margins": 0.9640765190124512, + "rewards/rejected": -4.029674530029297, + "step": 1521 + }, + { + "epoch": 0.9940403298228426, + "grad_norm": 37.37852136541512, + "learning_rate": 1.5810037230648554e-11, + "logits/chosen": -1.4990514516830444, + "logits/rejected": -1.498454213142395, + "logps/chosen": -753.627685546875, + "logps/rejected": -842.1553344726562, + "loss": 0.4797, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.89300537109375, + "rewards/margins": 0.8533806204795837, + "rewards/rejected": -3.7463865280151367, + "step": 1522 + }, + { + "epoch": 0.9946934443628052, + "grad_norm": 35.34109312088976, + "learning_rate": 1.2491973383951803e-11, + "logits/chosen": -1.433802843093872, + "logits/rejected": -1.4264992475509644, + "logps/chosen": -843.60986328125, + "logps/rejected": -899.9165649414062, + "loss": 0.5295, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5381946563720703, + "rewards/margins": 0.6915282011032104, + "rewards/rejected": -4.229722499847412, + "step": 1523 + }, + { + "epoch": 0.9953465589027676, + "grad_norm": 25.436468352463912, + "learning_rate": 9.564229351050056e-12, + "logits/chosen": -1.4901500940322876, + "logits/rejected": -1.4961150884628296, + "logps/chosen": -803.0394287109375, + "logps/rejected": -855.164794921875, + "loss": 0.5505, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.957271099090576, + "rewards/margins": 0.79988694190979, + "rewards/rejected": -3.757158041000366, + "step": 1524 + }, + { + "epoch": 0.99599967344273, + "grad_norm": 10.73432201222575, + "learning_rate": 7.0268203712448015e-12, + "logits/chosen": -1.5934062004089355, + "logits/rejected": -1.544229507446289, + "logps/chosen": -856.8956298828125, + "logps/rejected": -1123.9691162109375, + "loss": 0.4846, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.3569083213806152, + "rewards/margins": 2.04354190826416, + "rewards/rejected": -5.400450706481934, + "step": 1525 + }, + { + "epoch": 0.9966527879826924, + "grad_norm": 87.38096016034348, + "learning_rate": 4.879759652079429e-12, + "logits/chosen": -1.467991828918457, + "logits/rejected": -1.427109718322754, + "logps/chosen": -805.0746459960938, + "logps/rejected": -918.366455078125, + "loss": 0.5265, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3931772708892822, + "rewards/margins": 0.822685956954956, + "rewards/rejected": -4.215863227844238, + "step": 1526 + }, + { + "epoch": 0.9973059025226549, + "grad_norm": 24.30638520435916, + "learning_rate": 3.123058369280951e-12, + "logits/chosen": -1.4472224712371826, + "logits/rejected": -1.4677808284759521, + "logps/chosen": -832.0319213867188, + "logps/rejected": -963.2366333007812, + "loss": 0.5076, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1619575023651123, + "rewards/margins": 0.8776848912239075, + "rewards/rejected": -4.039642333984375, + "step": 1527 + }, + { + "epoch": 0.9979590170626174, + "grad_norm": 17.20772991475794, + "learning_rate": 1.756725666710035e-12, + "logits/chosen": -1.4970341920852661, + "logits/rejected": -1.4706482887268066, + "logps/chosen": -862.6663818359375, + "logps/rejected": -1025.07177734375, + "loss": 0.4429, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1549482345581055, + "rewards/margins": 1.3957924842834473, + "rewards/rejected": -4.550740718841553, + "step": 1528 + }, + { + "epoch": 0.9986121316025798, + "grad_norm": 17.972242666702652, + "learning_rate": 7.80768656319375e-13, + "logits/chosen": -1.409681797027588, + "logits/rejected": -1.3731991052627563, + "logps/chosen": -763.9820556640625, + "logps/rejected": -802.5194702148438, + "loss": 0.4251, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1270015239715576, + "rewards/margins": 0.6430358290672302, + "rewards/rejected": -3.7700371742248535, + "step": 1529 + }, + { + "epoch": 0.9992652461425422, + "grad_norm": 30.620744387516318, + "learning_rate": 1.9519241807874897e-13, + "logits/chosen": -1.5083037614822388, + "logits/rejected": -1.4722343683242798, + "logps/chosen": -971.5647583007812, + "logps/rejected": -995.6734619140625, + "loss": 0.4784, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.024622440338135, + "rewards/margins": 0.6361601948738098, + "rewards/rejected": -4.660782814025879, + "step": 1530 + }, + { + "epoch": 0.9999183606825047, + "grad_norm": 25.960604541088838, + "learning_rate": 0.0, + "logits/chosen": -1.506800651550293, + "logits/rejected": -1.5056391954421997, + "logps/chosen": -892.1127319335938, + "logps/rejected": -1093.7352294921875, + "loss": 0.4507, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.275479793548584, + "rewards/margins": 1.1585028171539307, + "rewards/rejected": -4.433982849121094, + "step": 1531 + }, + { + "epoch": 0.9999183606825047, + "step": 1531, + "total_flos": 0.0, + "train_loss": 0.5524635882372952, + "train_runtime": 40329.8241, + "train_samples_per_second": 4.859, + "train_steps_per_second": 0.038 + } + ], + "logging_steps": 1, + "max_steps": 1531, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}