{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.361328125, "learning_rate": 1.893939393939394e-09, "logits/chosen": -1.4715663194656372, "logits/rejected": -0.9266279935836792, "logps/chosen": -194.24078369140625, "logps/rejected": -198.9897003173828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.29296875, "learning_rate": 1.893939393939394e-08, "logits/chosen": -1.477033019065857, "logits/rejected": -1.0564019680023193, "logps/chosen": -209.50955200195312, "logps/rejected": -199.38864135742188, "loss": 0.693, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.00017503734852652997, "rewards/margins": -0.0007337426068261266, "rewards/margins_max": 0.0013889693655073643, "rewards/margins_min": -0.002856454811990261, "rewards/margins_std": 0.003001968376338482, "rewards/rejected": 0.0009087801445275545, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.291015625, "learning_rate": 3.787878787878788e-08, "logits/chosen": -1.4086185693740845, "logits/rejected": -0.9495820999145508, "logps/chosen": -248.8169403076172, "logps/rejected": -228.78634643554688, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0003385419549886137, "rewards/margins": -0.0008140860009007156, "rewards/margins_max": 0.0013392677064985037, "rewards/margins_min": -0.002967439591884613, "rewards/margins_std": 0.0030453018844127655, "rewards/rejected": 0.0004755440168082714, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.341796875, "learning_rate": 5.6818181818181815e-08, "logits/chosen": -1.3442885875701904, "logits/rejected": -1.0605539083480835, "logps/chosen": -199.19622802734375, "logps/rejected": -209.0339813232422, "loss": 0.6934, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.300643791386392e-05, "rewards/margins": -0.0006394138326868415, "rewards/margins_max": 0.0013273811200633645, "rewards/margins_min": -0.0026062086690217257, "rewards/margins_std": 0.0027814677450805902, "rewards/rejected": 0.0006164073711261153, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.326171875, "learning_rate": 7.575757575757576e-08, "logits/chosen": -1.3370082378387451, "logits/rejected": -1.105715036392212, "logps/chosen": -209.1099090576172, "logps/rejected": -232.79086303710938, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.00016671940102241933, "rewards/margins": -0.0002557897532824427, "rewards/margins_max": 0.0017397021874785423, "rewards/margins_min": -0.0022512818686664104, "rewards/margins_std": 0.002822051988914609, "rewards/rejected": 0.000422509154304862, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.2734375, "learning_rate": 9.469696969696969e-08, "logits/chosen": -1.3629438877105713, "logits/rejected": -1.057975172996521, "logps/chosen": -231.24758911132812, "logps/rejected": -239.5913543701172, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0008590269717387855, "rewards/margins": 0.0009242333471775055, "rewards/margins_max": 0.003584180725738406, "rewards/margins_min": -0.0017357139149680734, "rewards/margins_std": 0.0037617336492985487, "rewards/rejected": -6.520649912999943e-05, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.28515625, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -1.3622150421142578, "logits/rejected": -1.0315033197402954, "logps/chosen": -199.6304473876953, "logps/rejected": -212.83926391601562, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0002507457393221557, "rewards/margins": -0.0001059188725776039, "rewards/margins_max": 0.002144067781046033, "rewards/margins_min": -0.0023559057153761387, "rewards/margins_std": 0.0031819615978747606, "rewards/rejected": -0.0001448269176762551, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.28125, "learning_rate": 1.3257575757575757e-07, "logits/chosen": -1.379127860069275, "logits/rejected": -1.1307542324066162, "logps/chosen": -185.8495635986328, "logps/rejected": -207.99673461914062, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.000694580958224833, "rewards/margins": 0.00036811447353102267, "rewards/margins_max": 0.00218455889262259, "rewards/margins_min": -0.0014483298873528838, "rewards/margins_std": 0.002568840514868498, "rewards/rejected": 0.0003264665720053017, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.25390625, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -1.3772165775299072, "logits/rejected": -1.1073375940322876, "logps/chosen": -199.511474609375, "logps/rejected": -206.6280059814453, "loss": 0.6925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0008196959970518947, "rewards/margins": 0.0012880888534709811, "rewards/margins_max": 0.003329794155433774, "rewards/margins_min": -0.000753616273868829, "rewards/margins_std": 0.002887406852096319, "rewards/rejected": -0.00046839285641908646, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.259765625, "learning_rate": 1.7045454545454543e-07, "logits/chosen": -1.4906318187713623, "logits/rejected": -1.1894948482513428, "logps/chosen": -204.3669891357422, "logps/rejected": -220.7799072265625, "loss": 0.692, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0020121335983276367, "rewards/margins": 0.002966915722936392, "rewards/margins_max": 0.005120046902447939, "rewards/margins_min": 0.0008137855911627412, "rewards/margins_std": 0.0030449863988906145, "rewards/rejected": -0.0009547824738547206, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.318359375, "learning_rate": 1.8939393939393938e-07, "logits/chosen": -1.436471939086914, "logits/rejected": -1.124845266342163, "logps/chosen": -206.4337615966797, "logps/rejected": -215.1757049560547, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0012693710159510374, "rewards/margins": 0.0008539790287613869, "rewards/margins_max": 0.0031065032817423344, "rewards/margins_min": -0.0013985451078042388, "rewards/margins_std": 0.0031855504494160414, "rewards/rejected": 0.0004153919289819896, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.244140625, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.4335253238677979, "logits/rejected": -1.1322035789489746, "logps/chosen": -193.65826416015625, "logps/rejected": -200.38711547851562, "loss": 0.6924, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0013359611621126533, "rewards/margins": 0.0015128880040720105, "rewards/margins_max": 0.004125602543354034, "rewards/margins_min": -0.001099826768040657, "rewards/margins_std": 0.003694936167448759, "rewards/rejected": -0.00017692662368062884, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.2431640625, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -1.3395394086837769, "logits/rejected": -1.0315684080123901, "logps/chosen": -203.34072875976562, "logps/rejected": -192.33340454101562, "loss": 0.6922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0007492561126127839, "rewards/margins": 0.001645550481043756, "rewards/margins_max": 0.0036673967260867357, "rewards/margins_min": -0.00037629506550729275, "rewards/margins_std": 0.002859321655705571, "rewards/rejected": -0.0008962946012616158, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.3203125, "learning_rate": 2.462121212121212e-07, "logits/chosen": -1.4741287231445312, "logits/rejected": -1.0452475547790527, "logps/chosen": -252.58242797851562, "logps/rejected": -234.0312042236328, "loss": 0.6915, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.002638085512444377, "rewards/margins": 0.0036562737077474594, "rewards/margins_max": 0.0064610885456204414, "rewards/margins_min": 0.0008514595101587474, "rewards/margins_std": 0.003966606222093105, "rewards/rejected": -0.0010181884281337261, "step": 130 }, { "epoch": 0.05, "grad_norm": 0.28125, "learning_rate": 2.6515151515151514e-07, "logits/chosen": -1.3656947612762451, "logits/rejected": -1.044594407081604, "logps/chosen": -206.4570770263672, "logps/rejected": -211.5678253173828, "loss": 0.6914, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0019431791733950377, "rewards/margins": 0.0030987593345344067, "rewards/margins_max": 0.0053262365981936455, "rewards/margins_min": 0.000871282652951777, "rewards/margins_std": 0.0031501282937824726, "rewards/rejected": -0.0011555805103853345, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.267578125, "learning_rate": 2.840909090909091e-07, "logits/chosen": -1.3692163228988647, "logits/rejected": -1.0668622255325317, "logps/chosen": -197.52001953125, "logps/rejected": -231.03829956054688, "loss": 0.6911, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0020190435461699963, "rewards/margins": 0.004240790382027626, "rewards/margins_max": 0.007745341397821903, "rewards/margins_min": 0.0007362383184954524, "rewards/margins_std": 0.004956183955073357, "rewards/rejected": -0.0022217463701963425, "step": 150 }, { "epoch": 0.06, "grad_norm": 0.3359375, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -1.3538802862167358, "logits/rejected": -1.2184001207351685, "logps/chosen": -175.86048889160156, "logps/rejected": -229.12576293945312, "loss": 0.6912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0017691183602437377, "rewards/margins": 0.0037867154460400343, "rewards/margins_max": 0.006128127686679363, "rewards/margins_min": 0.0014453029725700617, "rewards/margins_std": 0.0033112571109086275, "rewards/rejected": -0.0020175972022116184, "step": 160 }, { "epoch": 0.06, "grad_norm": 0.439453125, "learning_rate": 3.2196969696969695e-07, "logits/chosen": -1.4180030822753906, "logits/rejected": -1.0590273141860962, "logps/chosen": -199.34068298339844, "logps/rejected": -224.4752197265625, "loss": 0.6906, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0037923683412373066, "rewards/margins": 0.005361521150916815, "rewards/margins_max": 0.008897420018911362, "rewards/margins_min": 0.0018256225157529116, "rewards/margins_std": 0.005000515840947628, "rewards/rejected": -0.0015691530425101519, "step": 170 }, { "epoch": 0.07, "grad_norm": 0.42578125, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -1.4597827196121216, "logits/rejected": -1.124783992767334, "logps/chosen": -210.7503204345703, "logps/rejected": -233.1248779296875, "loss": 0.6901, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.003537885844707489, "rewards/margins": 0.0069324844516813755, "rewards/margins_max": 0.009725996293127537, "rewards/margins_min": 0.004138973541557789, "rewards/margins_std": 0.0039506214670836926, "rewards/rejected": -0.00339459883980453, "step": 180 }, { "epoch": 0.07, "grad_norm": 0.34375, "learning_rate": 3.5984848484848486e-07, "logits/chosen": -1.431032419204712, "logits/rejected": -1.0481399297714233, "logps/chosen": -221.9123077392578, "logps/rejected": -223.14517211914062, "loss": 0.6896, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.002624739659950137, "rewards/margins": 0.006167866289615631, "rewards/margins_max": 0.009970493614673615, "rewards/margins_min": 0.002365240128710866, "rewards/margins_std": 0.005377725698053837, "rewards/rejected": -0.0035431268624961376, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.310546875, "learning_rate": 3.7878787878787876e-07, "logits/chosen": -1.3933680057525635, "logits/rejected": -1.0791442394256592, "logps/chosen": -216.5106964111328, "logps/rejected": -220.7644500732422, "loss": 0.6894, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0041386038064956665, "rewards/margins": 0.00709198322147131, "rewards/margins_max": 0.011054454371333122, "rewards/margins_min": 0.0031295125372707844, "rewards/margins_std": 0.005603780038654804, "rewards/rejected": -0.002953379647806287, "step": 200 }, { "epoch": 0.08, "grad_norm": 0.296875, "learning_rate": 3.977272727272727e-07, "logits/chosen": -1.3440656661987305, "logits/rejected": -1.1712138652801514, "logps/chosen": -197.05728149414062, "logps/rejected": -211.1956787109375, "loss": 0.6893, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.005659022368490696, "rewards/margins": 0.008139314129948616, "rewards/margins_max": 0.011830927804112434, "rewards/margins_min": 0.004447699058800936, "rewards/margins_std": 0.005220732185989618, "rewards/rejected": -0.0024802912957966328, "step": 210 }, { "epoch": 0.08, "grad_norm": 0.28515625, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.4657204151153564, "logits/rejected": -1.0755724906921387, "logps/chosen": -217.37124633789062, "logps/rejected": -216.32089233398438, "loss": 0.6889, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.005143741611391306, "rewards/margins": 0.009008055552840233, "rewards/margins_max": 0.015119703486561775, "rewards/margins_min": 0.0028964090161025524, "rewards/margins_std": 0.008643174543976784, "rewards/rejected": -0.0038643144071102142, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.267578125, "learning_rate": 4.3560606060606057e-07, "logits/chosen": -1.480959177017212, "logits/rejected": -1.1383044719696045, "logps/chosen": -206.03359985351562, "logps/rejected": -221.08889770507812, "loss": 0.6882, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.005945052020251751, "rewards/margins": 0.009998776949942112, "rewards/margins_max": 0.013563087210059166, "rewards/margins_min": 0.006434465758502483, "rewards/margins_std": 0.0050406972877681255, "rewards/rejected": -0.004053723998367786, "step": 230 }, { "epoch": 0.09, "grad_norm": 0.3046875, "learning_rate": 4.545454545454545e-07, "logits/chosen": -1.4084275960922241, "logits/rejected": -1.198710560798645, "logps/chosen": -195.56588745117188, "logps/rejected": -214.75173950195312, "loss": 0.6881, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0056480844505131245, "rewards/margins": 0.010742878541350365, "rewards/margins_max": 0.016357477754354477, "rewards/margins_min": 0.005128280725330114, "rewards/margins_std": 0.007940240204334259, "rewards/rejected": -0.00509479409083724, "step": 240 }, { "epoch": 0.09, "grad_norm": 0.294921875, "learning_rate": 4.734848484848485e-07, "logits/chosen": -1.439551830291748, "logits/rejected": -1.1524641513824463, "logps/chosen": -189.01382446289062, "logps/rejected": -229.9130859375, "loss": 0.6866, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.005949309095740318, "rewards/margins": 0.012400015257298946, "rewards/margins_max": 0.018142709508538246, "rewards/margins_min": 0.006657321937382221, "rewards/margins_std": 0.008121393620967865, "rewards/rejected": -0.0064507052302360535, "step": 250 }, { "epoch": 0.1, "grad_norm": 0.25390625, "learning_rate": 4.924242424242424e-07, "logits/chosen": -1.529597520828247, "logits/rejected": -1.2286248207092285, "logps/chosen": -176.6389617919922, "logps/rejected": -206.6012725830078, "loss": 0.6865, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.007012097630649805, "rewards/margins": 0.012566355988383293, "rewards/margins_max": 0.01809503510594368, "rewards/margins_min": 0.007037677802145481, "rewards/margins_std": 0.007818731479346752, "rewards/rejected": -0.005554257892072201, "step": 260 }, { "epoch": 0.1, "grad_norm": 0.298828125, "learning_rate": 4.999920796099437e-07, "logits/chosen": -1.4192304611206055, "logits/rejected": -1.127290964126587, "logps/chosen": -220.34561157226562, "logps/rejected": -223.0558624267578, "loss": 0.6848, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.00964759849011898, "rewards/margins": 0.01655970886349678, "rewards/margins_max": 0.023178983479738235, "rewards/margins_min": 0.0099404351785779, "rewards/margins_std": 0.009361067786812782, "rewards/rejected": -0.006912109907716513, "step": 270 }, { "epoch": 0.11, "grad_norm": 0.296875, "learning_rate": 4.999436790436923e-07, "logits/chosen": -1.4525351524353027, "logits/rejected": -1.1418185234069824, "logps/chosen": -198.59756469726562, "logps/rejected": -224.8717041015625, "loss": 0.6854, "rewards/accuracies": 0.875, "rewards/chosen": 0.006603570189327002, "rewards/margins": 0.014784199185669422, "rewards/margins_max": 0.022250749170780182, "rewards/margins_min": 0.00731764966621995, "rewards/margins_std": 0.010559295304119587, "rewards/rejected": -0.008180629462003708, "step": 280 }, { "epoch": 0.11, "grad_norm": 0.322265625, "learning_rate": 4.998512866364003e-07, "logits/chosen": -1.3750841617584229, "logits/rejected": -1.1162524223327637, "logps/chosen": -196.96287536621094, "logps/rejected": -234.20751953125, "loss": 0.6844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.00851917453110218, "rewards/margins": 0.01636343263089657, "rewards/margins_max": 0.024043789133429527, "rewards/margins_min": 0.008683075197041035, "rewards/margins_std": 0.010861665941774845, "rewards/rejected": -0.007844258099794388, "step": 290 }, { "epoch": 0.11, "grad_norm": 0.244140625, "learning_rate": 4.997149186497795e-07, "logits/chosen": -1.3377901315689087, "logits/rejected": -1.0969083309173584, "logps/chosen": -211.07064819335938, "logps/rejected": -216.2685546875, "loss": 0.6834, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.007954636588692665, "rewards/margins": 0.01811050996184349, "rewards/margins_max": 0.0279003344476223, "rewards/margins_min": 0.008320683613419533, "rewards/margins_std": 0.013844907283782959, "rewards/rejected": -0.0101558743044734, "step": 300 }, { "epoch": 0.12, "grad_norm": 0.294921875, "learning_rate": 4.995345990855521e-07, "logits/chosen": -1.3734376430511475, "logits/rejected": -1.094660997390747, "logps/chosen": -209.6073760986328, "logps/rejected": -222.2782745361328, "loss": 0.6834, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.010066825896501541, "rewards/margins": 0.02010076865553856, "rewards/margins_max": 0.029314884915947914, "rewards/margins_min": 0.01088665146380663, "rewards/margins_std": 0.013030730187892914, "rewards/rejected": -0.010033941827714443, "step": 310 }, { "epoch": 0.12, "grad_norm": 0.28515625, "learning_rate": 4.993103596812268e-07, "logits/chosen": -1.3820875883102417, "logits/rejected": -1.1262398958206177, "logps/chosen": -197.0897674560547, "logps/rejected": -210.62136840820312, "loss": 0.6821, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011968965642154217, "rewards/margins": 0.024391215294599533, "rewards/margins_max": 0.03280986472964287, "rewards/margins_min": 0.015972565859556198, "rewards/margins_std": 0.011905769817531109, "rewards/rejected": -0.012422251515090466, "step": 320 }, { "epoch": 0.13, "grad_norm": 0.330078125, "learning_rate": 4.990422399045117e-07, "logits/chosen": -1.4338971376419067, "logits/rejected": -1.1740996837615967, "logps/chosen": -192.77890014648438, "logps/rejected": -227.9630126953125, "loss": 0.6817, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012633833102881908, "rewards/margins": 0.0226894561201334, "rewards/margins_max": 0.03381115198135376, "rewards/margins_min": 0.01156776025891304, "rewards/margins_std": 0.015728455036878586, "rewards/rejected": -0.010055623017251492, "step": 330 }, { "epoch": 0.13, "grad_norm": 0.291015625, "learning_rate": 4.987302869463687e-07, "logits/chosen": -1.3690948486328125, "logits/rejected": -1.02110755443573, "logps/chosen": -216.27938842773438, "logps/rejected": -252.06924438476562, "loss": 0.6806, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.013316566124558449, "rewards/margins": 0.026786301285028458, "rewards/margins_max": 0.03776715695858002, "rewards/margins_min": 0.015805436298251152, "rewards/margins_std": 0.01552928052842617, "rewards/rejected": -0.01346973329782486, "step": 340 }, { "epoch": 0.13, "grad_norm": 0.3515625, "learning_rate": 4.98374555712707e-07, "logits/chosen": -1.3813257217407227, "logits/rejected": -1.0808042287826538, "logps/chosen": -202.20233154296875, "logps/rejected": -234.57485961914062, "loss": 0.6797, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012134606018662453, "rewards/margins": 0.031434256583452225, "rewards/margins_max": 0.04257182776927948, "rewards/margins_min": 0.020296679809689522, "rewards/margins_std": 0.015750911086797714, "rewards/rejected": -0.019299646839499474, "step": 350 }, { "epoch": 0.14, "grad_norm": 0.29296875, "learning_rate": 4.979751088147191e-07, "logits/chosen": -1.3914369344711304, "logits/rejected": -1.2104722261428833, "logps/chosen": -229.1161651611328, "logps/rejected": -233.09030151367188, "loss": 0.6786, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012462800368666649, "rewards/margins": 0.027766436338424683, "rewards/margins_max": 0.03918435052037239, "rewards/margins_min": 0.016348522156476974, "rewards/margins_std": 0.016147367656230927, "rewards/rejected": -0.015303634107112885, "step": 360 }, { "epoch": 0.14, "grad_norm": 0.2734375, "learning_rate": 4.97532016557862e-07, "logits/chosen": -1.4021486043930054, "logits/rejected": -1.1357289552688599, "logps/chosen": -194.69229125976562, "logps/rejected": -206.661865234375, "loss": 0.6797, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.010312230326235294, "rewards/margins": 0.025726106017827988, "rewards/margins_max": 0.036852724850177765, "rewards/margins_min": 0.01459948904812336, "rewards/margins_std": 0.015735412016510963, "rewards/rejected": -0.015413874760270119, "step": 370 }, { "epoch": 0.14, "grad_norm": 0.326171875, "learning_rate": 4.970453569294811e-07, "logits/chosen": -1.336107611656189, "logits/rejected": -1.0358856916427612, "logps/chosen": -199.49639892578125, "logps/rejected": -212.7289276123047, "loss": 0.6787, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013159600086510181, "rewards/margins": 0.030392413958907127, "rewards/margins_max": 0.04219727963209152, "rewards/margins_min": 0.018587548285722733, "rewards/margins_std": 0.016694601625204086, "rewards/rejected": -0.01723281480371952, "step": 380 }, { "epoch": 0.15, "grad_norm": 0.25390625, "learning_rate": 4.965152155850854e-07, "logits/chosen": -1.447141170501709, "logits/rejected": -1.1050375699996948, "logps/chosen": -208.603271484375, "logps/rejected": -216.28231811523438, "loss": 0.6774, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012095400132238865, "rewards/margins": 0.02805733121931553, "rewards/margins_max": 0.043057795614004135, "rewards/margins_min": 0.013056864961981773, "rewards/margins_std": 0.02121386118233204, "rewards/rejected": -0.01596193201839924, "step": 390 }, { "epoch": 0.15, "grad_norm": 0.26171875, "learning_rate": 4.959416858332709e-07, "logits/chosen": -1.3536365032196045, "logits/rejected": -1.0935043096542358, "logps/chosen": -178.9142608642578, "logps/rejected": -206.5353546142578, "loss": 0.6742, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015787001699209213, "rewards/margins": 0.03439975157380104, "rewards/margins_max": 0.047203429043293, "rewards/margins_min": 0.021596072241663933, "rewards/margins_std": 0.018107129260897636, "rewards/rejected": -0.01861274614930153, "step": 400 }, { "epoch": 0.16, "grad_norm": 0.2734375, "learning_rate": 4.953248686192974e-07, "logits/chosen": -1.404972791671753, "logits/rejected": -1.0352896451950073, "logps/chosen": -212.09304809570312, "logps/rejected": -224.814697265625, "loss": 0.6772, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014694595709443092, "rewards/margins": 0.032047249376773834, "rewards/margins_max": 0.04682258516550064, "rewards/margins_min": 0.01727190613746643, "rewards/margins_std": 0.020895490422844887, "rewards/rejected": -0.017352653667330742, "step": 410 }, { "epoch": 0.16, "grad_norm": 0.294921875, "learning_rate": 4.946648725073222e-07, "logits/chosen": -1.4838746786117554, "logits/rejected": -1.2245352268218994, "logps/chosen": -212.3232421875, "logps/rejected": -218.89013671875, "loss": 0.6757, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.016488736495375633, "rewards/margins": 0.03431572765111923, "rewards/margins_max": 0.04685080051422119, "rewards/margins_min": 0.02178065851330757, "rewards/margins_std": 0.017727266997098923, "rewards/rejected": -0.017826993018388748, "step": 420 }, { "epoch": 0.16, "grad_norm": 0.29296875, "learning_rate": 4.93961813661291e-07, "logits/chosen": -1.5144537687301636, "logits/rejected": -1.2026809453964233, "logps/chosen": -191.7454071044922, "logps/rejected": -203.25830078125, "loss": 0.676, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014662086963653564, "rewards/margins": 0.03749576956033707, "rewards/margins_max": 0.05218175798654556, "rewards/margins_min": 0.022809788584709167, "rewards/margins_std": 0.020769115537405014, "rewards/rejected": -0.022833682596683502, "step": 430 }, { "epoch": 0.17, "grad_norm": 0.31640625, "learning_rate": 4.932158158244936e-07, "logits/chosen": -1.460267186164856, "logits/rejected": -1.102550745010376, "logps/chosen": -219.0205841064453, "logps/rejected": -230.55673217773438, "loss": 0.6748, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01621498540043831, "rewards/margins": 0.03951702266931534, "rewards/margins_max": 0.05593367666006088, "rewards/margins_min": 0.023100370541214943, "rewards/margins_std": 0.023216653615236282, "rewards/rejected": -0.02330203540623188, "step": 440 }, { "epoch": 0.17, "grad_norm": 0.287109375, "learning_rate": 4.924270102977827e-07, "logits/chosen": -1.4851996898651123, "logits/rejected": -1.1021556854248047, "logps/chosen": -212.36428833007812, "logps/rejected": -234.95278930664062, "loss": 0.6729, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01592712476849556, "rewards/margins": 0.040354181081056595, "rewards/margins_max": 0.05639289692044258, "rewards/margins_min": 0.02431546524167061, "rewards/margins_std": 0.022682171314954758, "rewards/rejected": -0.024427054449915886, "step": 450 }, { "epoch": 0.17, "grad_norm": 0.298828125, "learning_rate": 4.915955359164651e-07, "logits/chosen": -1.4281965494155884, "logits/rejected": -1.1299288272857666, "logps/chosen": -203.64056396484375, "logps/rejected": -209.16561889648438, "loss": 0.6725, "rewards/accuracies": 1.0, "rewards/chosen": 0.014464335516095161, "rewards/margins": 0.03879351541399956, "rewards/margins_max": 0.05169098451733589, "rewards/margins_min": 0.025896048173308372, "rewards/margins_std": 0.01823977194726467, "rewards/rejected": -0.024329179897904396, "step": 460 }, { "epoch": 0.18, "grad_norm": 0.3203125, "learning_rate": 4.907215390258652e-07, "logits/chosen": -1.3715511560440063, "logits/rejected": -1.1071712970733643, "logps/chosen": -186.81532287597656, "logps/rejected": -208.74130249023438, "loss": 0.6714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01985536888241768, "rewards/margins": 0.04831348732113838, "rewards/margins_max": 0.07152094691991806, "rewards/margins_min": 0.025106023997068405, "rewards/margins_std": 0.03282030671834946, "rewards/rejected": -0.028458122164011, "step": 470 }, { "epoch": 0.18, "grad_norm": 0.294921875, "learning_rate": 4.898051734555674e-07, "logits/chosen": -1.520582675933838, "logits/rejected": -1.1530619859695435, "logps/chosen": -226.6740264892578, "logps/rejected": -220.93807983398438, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.01769147627055645, "rewards/margins": 0.04557216167449951, "rewards/margins_max": 0.06148766726255417, "rewards/margins_min": 0.029656657949090004, "rewards/margins_std": 0.0225079283118248, "rewards/rejected": -0.02788068726658821, "step": 480 }, { "epoch": 0.19, "grad_norm": 0.349609375, "learning_rate": 4.888466004923412e-07, "logits/chosen": -1.4570837020874023, "logits/rejected": -1.154343605041504, "logps/chosen": -241.1246795654297, "logps/rejected": -217.9856719970703, "loss": 0.6717, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01621456816792488, "rewards/margins": 0.039181966334581375, "rewards/margins_max": 0.056494325399398804, "rewards/margins_min": 0.021869609132409096, "rewards/margins_std": 0.024483371526002884, "rewards/rejected": -0.022967400029301643, "step": 490 }, { "epoch": 0.19, "grad_norm": 0.322265625, "learning_rate": 4.878459888517532e-07, "logits/chosen": -1.3344757556915283, "logits/rejected": -1.1008737087249756, "logps/chosen": -204.79550170898438, "logps/rejected": -226.4160614013672, "loss": 0.6695, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01746196672320366, "rewards/margins": 0.0481671541929245, "rewards/margins_max": 0.06977338343858719, "rewards/margins_min": 0.02656092867255211, "rewards/margins_std": 0.03055582009255886, "rewards/rejected": -0.03070518746972084, "step": 500 }, { "epoch": 0.19, "grad_norm": 0.296875, "learning_rate": 4.86803514648472e-07, "logits/chosen": -1.5273072719573975, "logits/rejected": -1.1597423553466797, "logps/chosen": -220.5069580078125, "logps/rejected": -226.74734497070312, "loss": 0.6702, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02024765871465206, "rewards/margins": 0.042053740471601486, "rewards/margins_max": 0.062348585575819016, "rewards/margins_min": 0.02175888977944851, "rewards/margins_std": 0.02870125137269497, "rewards/rejected": -0.021806079894304276, "step": 510 }, { "epoch": 0.2, "grad_norm": 0.326171875, "learning_rate": 4.85719361365271e-07, "logits/chosen": -1.3967102766036987, "logits/rejected": -1.3165340423583984, "logps/chosen": -196.89349365234375, "logps/rejected": -268.110595703125, "loss": 0.6671, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015946678817272186, "rewards/margins": 0.05510186403989792, "rewards/margins_max": 0.07911844551563263, "rewards/margins_min": 0.03108527697622776, "rewards/margins_std": 0.03396458178758621, "rewards/rejected": -0.03915518522262573, "step": 520 }, { "epoch": 0.2, "grad_norm": 0.306640625, "learning_rate": 4.845937198207342e-07, "logits/chosen": -1.3531572818756104, "logits/rejected": -1.0980560779571533, "logps/chosen": -193.45533752441406, "logps/rejected": -219.8807830810547, "loss": 0.6693, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020253274589776993, "rewards/margins": 0.052356742322444916, "rewards/margins_max": 0.07621929049491882, "rewards/margins_min": 0.02849418856203556, "rewards/margins_std": 0.03374674171209335, "rewards/rejected": -0.03210346773266792, "step": 530 }, { "epoch": 0.21, "grad_norm": 0.28515625, "learning_rate": 4.834267881356707e-07, "logits/chosen": -1.4308512210845947, "logits/rejected": -1.106432557106018, "logps/chosen": -192.52944946289062, "logps/rejected": -235.0094757080078, "loss": 0.6655, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0184812992811203, "rewards/margins": 0.05686504766345024, "rewards/margins_max": 0.08549021184444427, "rewards/margins_min": 0.028239887207746506, "rewards/margins_std": 0.04048209637403488, "rewards/rejected": -0.03838375210762024, "step": 540 }, { "epoch": 0.21, "grad_norm": 0.318359375, "learning_rate": 4.822187716982439e-07, "logits/chosen": -1.4813454151153564, "logits/rejected": -1.1552975177764893, "logps/chosen": -209.415283203125, "logps/rejected": -203.3141632080078, "loss": 0.6691, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015650704503059387, "rewards/margins": 0.052040062844753265, "rewards/margins_max": 0.07295672595500946, "rewards/margins_min": 0.031123405322432518, "rewards/margins_std": 0.0295806173235178, "rewards/rejected": -0.03638935834169388, "step": 550 }, { "epoch": 0.21, "grad_norm": 0.271484375, "learning_rate": 4.809698831278217e-07, "logits/chosen": -1.3834329843521118, "logits/rejected": -1.0720294713974, "logps/chosen": -212.3925018310547, "logps/rejected": -222.80795288085938, "loss": 0.6658, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02054109051823616, "rewards/margins": 0.05672860145568848, "rewards/margins_max": 0.07977689057588577, "rewards/margins_min": 0.033680304884910583, "rewards/margins_std": 0.032595209777355194, "rewards/rejected": -0.03618750721216202, "step": 560 }, { "epoch": 0.22, "grad_norm": 0.294921875, "learning_rate": 4.796803422375544e-07, "logits/chosen": -1.4147670269012451, "logits/rejected": -1.0929642915725708, "logps/chosen": -210.1922607421875, "logps/rejected": -212.7117462158203, "loss": 0.6672, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.018465718254446983, "rewards/margins": 0.04878971725702286, "rewards/margins_max": 0.06830336898565292, "rewards/margins_min": 0.029276061803102493, "rewards/margins_std": 0.027596473693847656, "rewards/rejected": -0.030323997139930725, "step": 570 }, { "epoch": 0.22, "grad_norm": 0.26953125, "learning_rate": 4.783503759956858e-07, "logits/chosen": -1.4458619356155396, "logits/rejected": -1.0822944641113281, "logps/chosen": -219.31643676757812, "logps/rejected": -230.5920867919922, "loss": 0.6654, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01647743210196495, "rewards/margins": 0.058020107448101044, "rewards/margins_max": 0.08135350048542023, "rewards/margins_min": 0.03468669205904007, "rewards/margins_std": 0.03299842029809952, "rewards/rejected": -0.041542667895555496, "step": 580 }, { "epoch": 0.22, "grad_norm": 0.330078125, "learning_rate": 4.769802184856049e-07, "logits/chosen": -1.3940141201019287, "logits/rejected": -1.0562171936035156, "logps/chosen": -200.2158660888672, "logps/rejected": -233.1265106201172, "loss": 0.6672, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.020147614181041718, "rewards/margins": 0.05223570391535759, "rewards/margins_max": 0.07756931334733963, "rewards/margins_min": 0.026902100071310997, "rewards/margins_std": 0.03582713380455971, "rewards/rejected": -0.03208809345960617, "step": 590 }, { "epoch": 0.23, "grad_norm": 0.3046875, "learning_rate": 4.7557011086464625e-07, "logits/chosen": -1.4117199182510376, "logits/rejected": -1.1517552137374878, "logps/chosen": -195.2522735595703, "logps/rejected": -215.0863800048828, "loss": 0.6662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022823641076683998, "rewards/margins": 0.05536242574453354, "rewards/margins_max": 0.07616210728883743, "rewards/margins_min": 0.03456275910139084, "rewards/margins_std": 0.029415175318717957, "rewards/rejected": -0.03253878653049469, "step": 600 }, { "epoch": 0.23, "grad_norm": 0.322265625, "learning_rate": 4.74120301321644e-07, "logits/chosen": -1.4161319732666016, "logits/rejected": -1.0721873044967651, "logps/chosen": -213.9852752685547, "logps/rejected": -222.2463836669922, "loss": 0.6634, "rewards/accuracies": 0.9375, "rewards/chosen": 0.020179515704512596, "rewards/margins": 0.06596876680850983, "rewards/margins_max": 0.08490391820669174, "rewards/margins_min": 0.047033630311489105, "rewards/margins_std": 0.0267783310264349, "rewards/rejected": -0.04578925296664238, "step": 610 }, { "epoch": 0.24, "grad_norm": 0.279296875, "learning_rate": 4.7263104503324927e-07, "logits/chosen": -1.4241613149642944, "logits/rejected": -1.0511661767959595, "logps/chosen": -229.7281036376953, "logps/rejected": -221.2598419189453, "loss": 0.6662, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022109119221568108, "rewards/margins": 0.05698208883404732, "rewards/margins_max": 0.08112742006778717, "rewards/margins_min": 0.03283676132559776, "rewards/margins_std": 0.03414664790034294, "rewards/rejected": -0.03487296774983406, "step": 620 }, { "epoch": 0.24, "grad_norm": 0.26953125, "learning_rate": 4.711026041190167e-07, "logits/chosen": -1.4864065647125244, "logits/rejected": -1.1408494710922241, "logps/chosen": -190.9454345703125, "logps/rejected": -185.9245147705078, "loss": 0.6632, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024640116840600967, "rewards/margins": 0.0617794506251812, "rewards/margins_max": 0.08624964207410812, "rewards/margins_min": 0.03730924800038338, "rewards/margins_std": 0.03460608795285225, "rewards/rejected": -0.03713933378458023, "step": 630 }, { "epoch": 0.24, "grad_norm": 0.318359375, "learning_rate": 4.6953524759527053e-07, "logits/chosen": -1.3665571212768555, "logits/rejected": -1.0415217876434326, "logps/chosen": -206.030029296875, "logps/rejected": -218.22738647460938, "loss": 0.6626, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02321634814143181, "rewards/margins": 0.06289292871952057, "rewards/margins_max": 0.08443962037563324, "rewards/margins_min": 0.041346240788698196, "rewards/margins_std": 0.030471617355942726, "rewards/rejected": -0.03967657685279846, "step": 640 }, { "epoch": 0.25, "grad_norm": 0.35546875, "learning_rate": 4.6792925132775486e-07, "logits/chosen": -1.3912900686264038, "logits/rejected": -1.2024117708206177, "logps/chosen": -189.44032287597656, "logps/rejected": -203.79727172851562, "loss": 0.6643, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0144128929823637, "rewards/margins": 0.05512215569615364, "rewards/margins_max": 0.08231507241725922, "rewards/margins_min": 0.02792922779917717, "rewards/margins_std": 0.03845660015940666, "rewards/rejected": -0.04070926457643509, "step": 650 }, { "epoch": 0.25, "grad_norm": 0.3515625, "learning_rate": 4.6628489798308004e-07, "logits/chosen": -1.3954074382781982, "logits/rejected": -1.1058709621429443, "logps/chosen": -217.58493041992188, "logps/rejected": -220.1082000732422, "loss": 0.6622, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.015280501917004585, "rewards/margins": 0.058963824063539505, "rewards/margins_max": 0.0854538083076477, "rewards/margins_min": 0.032473836094141006, "rewards/margins_std": 0.037462495267391205, "rewards/rejected": -0.04368331655859947, "step": 660 }, { "epoch": 0.25, "grad_norm": 0.3046875, "learning_rate": 4.64602476978971e-07, "logits/chosen": -1.3883717060089111, "logits/rejected": -1.0562824010849, "logps/chosen": -209.17697143554688, "logps/rejected": -229.61672973632812, "loss": 0.6604, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018770989030599594, "rewards/margins": 0.0635194405913353, "rewards/margins_max": 0.09025418758392334, "rewards/margins_min": 0.03678469732403755, "rewards/margins_std": 0.03780863806605339, "rewards/rejected": -0.044748447835445404, "step": 670 }, { "epoch": 0.26, "grad_norm": 0.306640625, "learning_rate": 4.6288228443332776e-07, "logits/chosen": -1.4112727642059326, "logits/rejected": -1.1717571020126343, "logps/chosen": -183.66964721679688, "logps/rejected": -186.0308380126953, "loss": 0.6619, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01850176230072975, "rewards/margins": 0.05745549127459526, "rewards/margins_max": 0.08195947110652924, "rewards/margins_min": 0.03295152261853218, "rewards/margins_std": 0.034653857350349426, "rewards/rejected": -0.038953740149736404, "step": 680 }, { "epoch": 0.26, "grad_norm": 0.306640625, "learning_rate": 4.6112462311210685e-07, "logits/chosen": -1.3851698637008667, "logits/rejected": -0.9565486907958984, "logps/chosen": -208.18746948242188, "logps/rejected": -205.7491455078125, "loss": 0.6615, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.021183792501688004, "rewards/margins": 0.06673257052898407, "rewards/margins_max": 0.09535098820924759, "rewards/margins_min": 0.038114141672849655, "rewards/margins_std": 0.04047255963087082, "rewards/rejected": -0.04554877430200577, "step": 690 }, { "epoch": 0.27, "grad_norm": 0.2890625, "learning_rate": 4.593298023760319e-07, "logits/chosen": -1.4321620464324951, "logits/rejected": -1.0346721410751343, "logps/chosen": -235.98095703125, "logps/rejected": -261.7032165527344, "loss": 0.6602, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.020307691767811775, "rewards/margins": 0.06724603474140167, "rewards/margins_max": 0.09406231343746185, "rewards/margins_min": 0.040429744869470596, "rewards/margins_std": 0.03792395070195198, "rewards/rejected": -0.04693833738565445, "step": 700 }, { "epoch": 0.27, "grad_norm": 0.279296875, "learning_rate": 4.5749813812614447e-07, "logits/chosen": -1.5499064922332764, "logits/rejected": -1.1074297428131104, "logps/chosen": -237.53274536132812, "logps/rejected": -226.1111297607422, "loss": 0.6626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020518099889159203, "rewards/margins": 0.05978738144040108, "rewards/margins_max": 0.08547325432300568, "rewards/margins_min": 0.034101493656635284, "rewards/margins_std": 0.036325328052043915, "rewards/rejected": -0.03926927596330643, "step": 710 }, { "epoch": 0.27, "grad_norm": 0.302734375, "learning_rate": 4.5562995274820283e-07, "logits/chosen": -1.4489339590072632, "logits/rejected": -1.154266595840454, "logps/chosen": -205.8553466796875, "logps/rejected": -199.05389404296875, "loss": 0.6577, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.019976060837507248, "rewards/margins": 0.07855883985757828, "rewards/margins_max": 0.11712227761745453, "rewards/margins_min": 0.039995402097702026, "rewards/margins_std": 0.05453693866729736, "rewards/rejected": -0.05858277156949043, "step": 720 }, { "epoch": 0.28, "grad_norm": 0.31640625, "learning_rate": 4.5372557505594024e-07, "logits/chosen": -1.4426929950714111, "logits/rejected": -1.1371030807495117, "logps/chosen": -228.83718872070312, "logps/rejected": -260.3121032714844, "loss": 0.6573, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024684404954314232, "rewards/margins": 0.07492948323488235, "rewards/margins_max": 0.1032097116112709, "rewards/margins_min": 0.04664924368262291, "rewards/margins_std": 0.039994291961193085, "rewards/rejected": -0.050245076417922974, "step": 730 }, { "epoch": 0.28, "grad_norm": 0.263671875, "learning_rate": 4.517853402331909e-07, "logits/chosen": -1.5174484252929688, "logits/rejected": -1.0977634191513062, "logps/chosen": -199.60836791992188, "logps/rejected": -209.7433319091797, "loss": 0.6595, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.026398539543151855, "rewards/margins": 0.07308533042669296, "rewards/margins_max": 0.10156605392694473, "rewards/margins_min": 0.04460462927818298, "rewards/margins_std": 0.04027780145406723, "rewards/rejected": -0.046686798334121704, "step": 740 }, { "epoch": 0.28, "grad_norm": 0.2890625, "learning_rate": 4.4980958977489594e-07, "logits/chosen": -1.416208267211914, "logits/rejected": -1.0870482921600342, "logps/chosen": -216.3725128173828, "logps/rejected": -215.8811492919922, "loss": 0.652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.020642448216676712, "rewards/margins": 0.07765364646911621, "rewards/margins_max": 0.11166071891784668, "rewards/margins_min": 0.043646566569805145, "rewards/margins_std": 0.0480932779610157, "rewards/rejected": -0.0570111945271492, "step": 750 }, { "epoch": 0.29, "grad_norm": 0.2734375, "learning_rate": 4.477986714269971e-07, "logits/chosen": -1.4371469020843506, "logits/rejected": -1.1657259464263916, "logps/chosen": -183.5517120361328, "logps/rejected": -202.76528930664062, "loss": 0.6589, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022578328847885132, "rewards/margins": 0.06701181083917618, "rewards/margins_max": 0.09141481667757034, "rewards/margins_min": 0.04260881245136261, "rewards/margins_std": 0.03451105207204819, "rewards/rejected": -0.044433485716581345, "step": 760 }, { "epoch": 0.29, "grad_norm": 0.26953125, "learning_rate": 4.457529391252317e-07, "logits/chosen": -1.4765751361846924, "logits/rejected": -1.1041843891143799, "logps/chosen": -217.7615966796875, "logps/rejected": -216.8796844482422, "loss": 0.6584, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021331623196601868, "rewards/margins": 0.0649794489145279, "rewards/margins_max": 0.09212762117385864, "rewards/margins_min": 0.03783128783106804, "rewards/margins_std": 0.0383933000266552, "rewards/rejected": -0.043647829443216324, "step": 770 }, { "epoch": 0.3, "grad_norm": 0.259765625, "learning_rate": 4.43672752932837e-07, "logits/chosen": -1.432063102722168, "logits/rejected": -1.1789019107818604, "logps/chosen": -194.9161834716797, "logps/rejected": -193.73593139648438, "loss": 0.6625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02206316404044628, "rewards/margins": 0.06179576367139816, "rewards/margins_max": 0.0931532233953476, "rewards/margins_min": 0.030438298359513283, "rewards/margins_std": 0.0443461537361145, "rewards/rejected": -0.03973260149359703, "step": 780 }, { "epoch": 0.3, "grad_norm": 0.296875, "learning_rate": 4.415584789771769e-07, "logits/chosen": -1.3746683597564697, "logits/rejected": -1.1170393228530884, "logps/chosen": -192.1763153076172, "logps/rejected": -205.6260223388672, "loss": 0.661, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.025766903534531593, "rewards/margins": 0.0719086080789566, "rewards/margins_max": 0.09772966802120209, "rewards/margins_min": 0.04608756676316261, "rewards/margins_std": 0.0365164689719677, "rewards/rejected": -0.04614170640707016, "step": 790 }, { "epoch": 0.3, "grad_norm": 0.279296875, "learning_rate": 4.394104893853007e-07, "logits/chosen": -1.4416863918304443, "logits/rejected": -1.0908607244491577, "logps/chosen": -219.8855743408203, "logps/rejected": -217.4703826904297, "loss": 0.6541, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02542579174041748, "rewards/margins": 0.08650486171245575, "rewards/margins_max": 0.1185513511300087, "rewards/margins_min": 0.0544583797454834, "rewards/margins_std": 0.04532057046890259, "rewards/rejected": -0.06107907369732857, "step": 800 }, { "epoch": 0.31, "grad_norm": 0.3203125, "learning_rate": 4.3722916221844613e-07, "logits/chosen": -1.3401496410369873, "logits/rejected": -0.9828430414199829, "logps/chosen": -213.7498779296875, "logps/rejected": -208.93325805664062, "loss": 0.6557, "rewards/accuracies": 1.0, "rewards/chosen": 0.028565192595124245, "rewards/margins": 0.08439178764820099, "rewards/margins_max": 0.11834411323070526, "rewards/margins_min": 0.05043945461511612, "rewards/margins_std": 0.04801584780216217, "rewards/rejected": -0.055826593190431595, "step": 810 }, { "epoch": 0.31, "grad_norm": 0.349609375, "learning_rate": 4.350148814054982e-07, "logits/chosen": -1.3076943159103394, "logits/rejected": -1.0068638324737549, "logps/chosen": -224.6240997314453, "logps/rejected": -238.9586181640625, "loss": 0.6556, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01850820705294609, "rewards/margins": 0.0732334703207016, "rewards/margins_max": 0.11072257906198502, "rewards/margins_min": 0.03574436157941818, "rewards/margins_std": 0.05301760509610176, "rewards/rejected": -0.05472525954246521, "step": 820 }, { "epoch": 0.32, "grad_norm": 0.2890625, "learning_rate": 4.327680366754146e-07, "logits/chosen": -1.356018304824829, "logits/rejected": -1.0702496767044067, "logps/chosen": -211.2010040283203, "logps/rejected": -209.5356903076172, "loss": 0.6569, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02133285626769066, "rewards/margins": 0.06990201771259308, "rewards/margins_max": 0.10147367417812347, "rewards/margins_min": 0.03833036497235298, "rewards/margins_std": 0.04464906454086304, "rewards/rejected": -0.04856916517019272, "step": 830 }, { "epoch": 0.32, "grad_norm": 0.318359375, "learning_rate": 4.3048902348863106e-07, "logits/chosen": -1.469026803970337, "logits/rejected": -1.1089591979980469, "logps/chosen": -224.88095092773438, "logps/rejected": -225.6546173095703, "loss": 0.6554, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025625556707382202, "rewards/margins": 0.0845932811498642, "rewards/margins_max": 0.11639096587896347, "rewards/margins_min": 0.05279559642076492, "rewards/margins_std": 0.04496871680021286, "rewards/rejected": -0.05896772816777229, "step": 840 }, { "epoch": 0.32, "grad_norm": 0.326171875, "learning_rate": 4.2817824296745736e-07, "logits/chosen": -1.410362958908081, "logits/rejected": -1.0984877347946167, "logps/chosen": -205.9351348876953, "logps/rejected": -216.6947021484375, "loss": 0.658, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022886861115694046, "rewards/margins": 0.0620783269405365, "rewards/margins_max": 0.09115530550479889, "rewards/margins_min": 0.03300134092569351, "rewards/margins_std": 0.04112106189131737, "rewards/rejected": -0.039191462099552155, "step": 850 }, { "epoch": 0.33, "grad_norm": 0.33203125, "learning_rate": 4.258361018254769e-07, "logits/chosen": -1.465014934539795, "logits/rejected": -1.1755095720291138, "logps/chosen": -192.93734741210938, "logps/rejected": -219.0324249267578, "loss": 0.6532, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025163287296891212, "rewards/margins": 0.0787445530295372, "rewards/margins_max": 0.11643560230731964, "rewards/margins_min": 0.041053496301174164, "rewards/margins_std": 0.05330319330096245, "rewards/rejected": -0.05358126014471054, "step": 860 }, { "epoch": 0.33, "grad_norm": 0.28515625, "learning_rate": 4.234630122959625e-07, "logits/chosen": -1.5218905210494995, "logits/rejected": -1.1900856494903564, "logps/chosen": -195.29864501953125, "logps/rejected": -254.084716796875, "loss": 0.6557, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021066388115286827, "rewards/margins": 0.0712776631116867, "rewards/margins_max": 0.09616719186306, "rewards/margins_min": 0.046388134360313416, "rewards/margins_std": 0.0351991131901741, "rewards/rejected": -0.05021127313375473, "step": 870 }, { "epoch": 0.33, "grad_norm": 0.703125, "learning_rate": 4.2105939205932005e-07, "logits/chosen": -1.3866618871688843, "logits/rejected": -1.1061947345733643, "logps/chosen": -197.81729125976562, "logps/rejected": -253.219970703125, "loss": 0.6515, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029097964987158775, "rewards/margins": 0.08818992227315903, "rewards/margins_max": 0.12071744352579117, "rewards/margins_min": 0.055662404745817184, "rewards/margins_std": 0.04600085690617561, "rewards/rejected": -0.0590919628739357, "step": 880 }, { "epoch": 0.34, "grad_norm": 0.34765625, "learning_rate": 4.1862566416957444e-07, "logits/chosen": -1.5151255130767822, "logits/rejected": -1.148018479347229, "logps/chosen": -202.05813598632812, "logps/rejected": -222.6027069091797, "loss": 0.6562, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.019906148314476013, "rewards/margins": 0.07728614658117294, "rewards/margins_max": 0.10957305133342743, "rewards/margins_min": 0.044999223202466965, "rewards/margins_std": 0.0456605963408947, "rewards/rejected": -0.05737999826669693, "step": 890 }, { "epoch": 0.34, "grad_norm": 0.306640625, "learning_rate": 4.161622569799085e-07, "logits/chosen": -1.3949564695358276, "logits/rejected": -1.0818382501602173, "logps/chosen": -195.73452758789062, "logps/rejected": -196.80213928222656, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 0.021286411210894585, "rewards/margins": 0.07778953015804291, "rewards/margins_max": 0.10992787778377533, "rewards/margins_min": 0.04565117880702019, "rewards/margins_std": 0.04545048624277115, "rewards/rejected": -0.056503117084503174, "step": 900 }, { "epoch": 0.35, "grad_norm": 0.267578125, "learning_rate": 4.136696040672702e-07, "logits/chosen": -1.3432233333587646, "logits/rejected": -0.9210837483406067, "logps/chosen": -227.01687622070312, "logps/rejected": -276.3375549316406, "loss": 0.6503, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02953980304300785, "rewards/margins": 0.09690040349960327, "rewards/margins_max": 0.13372206687927246, "rewards/margins_min": 0.06007874757051468, "rewards/margins_std": 0.052073679864406586, "rewards/rejected": -0.06736060231924057, "step": 910 }, { "epoch": 0.35, "grad_norm": 0.3125, "learning_rate": 4.1114814415605975e-07, "logits/chosen": -1.39051353931427, "logits/rejected": -1.1195346117019653, "logps/chosen": -172.82054138183594, "logps/rejected": -214.9634552001953, "loss": 0.6491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025750596076250076, "rewards/margins": 0.08932848274707794, "rewards/margins_max": 0.12272658199071884, "rewards/margins_min": 0.05593038722872734, "rewards/margins_std": 0.04723203927278519, "rewards/rejected": -0.06357789039611816, "step": 920 }, { "epoch": 0.35, "grad_norm": 0.33984375, "learning_rate": 4.0859832104091136e-07, "logits/chosen": -1.4840190410614014, "logits/rejected": -1.109261155128479, "logps/chosen": -211.09658813476562, "logps/rejected": -212.0129852294922, "loss": 0.6542, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0225338414311409, "rewards/margins": 0.0804629772901535, "rewards/margins_max": 0.1128251925110817, "rewards/margins_min": 0.048100754618644714, "rewards/margins_std": 0.04576708376407623, "rewards/rejected": -0.057929135859012604, "step": 930 }, { "epoch": 0.36, "grad_norm": 0.283203125, "learning_rate": 4.060205835085821e-07, "logits/chosen": -1.4287742376327515, "logits/rejected": -1.0760209560394287, "logps/chosen": -216.3826904296875, "logps/rejected": -228.55966186523438, "loss": 0.6575, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02253951132297516, "rewards/margins": 0.0758446678519249, "rewards/margins_max": 0.104688860476017, "rewards/margins_min": 0.0470004640519619, "rewards/margins_std": 0.04079186171293259, "rewards/rejected": -0.05330515652894974, "step": 940 }, { "epoch": 0.36, "grad_norm": 0.35546875, "learning_rate": 4.034153852589623e-07, "logits/chosen": -1.490159273147583, "logits/rejected": -1.1104148626327515, "logps/chosen": -192.82733154296875, "logps/rejected": -206.4297637939453, "loss": 0.6549, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02006385289132595, "rewards/margins": 0.07609236240386963, "rewards/margins_max": 0.112637460231781, "rewards/margins_min": 0.03954725340008736, "rewards/margins_std": 0.05168258026242256, "rewards/rejected": -0.056028496474027634, "step": 950 }, { "epoch": 0.36, "grad_norm": 0.33984375, "learning_rate": 4.0078318482522114e-07, "logits/chosen": -1.417443871498108, "logits/rejected": -1.0727176666259766, "logps/chosen": -222.9761962890625, "logps/rejected": -221.27786254882812, "loss": 0.6543, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.017086107283830643, "rewards/margins": 0.07355757057666779, "rewards/margins_max": 0.10742592811584473, "rewards/margins_min": 0.03968920186161995, "rewards/margins_std": 0.0478971004486084, "rewards/rejected": -0.056471455842256546, "step": 960 }, { "epoch": 0.37, "grad_norm": 0.291015625, "learning_rate": 3.9812444549310166e-07, "logits/chosen": -1.4559601545333862, "logits/rejected": -1.1964634656906128, "logps/chosen": -189.91392517089844, "logps/rejected": -199.15951538085938, "loss": 0.6502, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.025768781080842018, "rewards/margins": 0.08322307467460632, "rewards/margins_max": 0.11746567487716675, "rewards/margins_min": 0.048980481922626495, "rewards/margins_std": 0.04842633754014969, "rewards/rejected": -0.05745428800582886, "step": 970 }, { "epoch": 0.37, "grad_norm": 0.318359375, "learning_rate": 3.9543963521937915e-07, "logits/chosen": -1.3898035287857056, "logits/rejected": -1.138649821281433, "logps/chosen": -204.3951873779297, "logps/rejected": -227.2592010498047, "loss": 0.6544, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.015050689689815044, "rewards/margins": 0.072553351521492, "rewards/margins_max": 0.10220275074243546, "rewards/margins_min": 0.042903970927000046, "rewards/margins_std": 0.04193056747317314, "rewards/rejected": -0.05750266835093498, "step": 980 }, { "epoch": 0.38, "grad_norm": 0.3046875, "learning_rate": 3.927292265494978e-07, "logits/chosen": -1.3583369255065918, "logits/rejected": -1.1555811166763306, "logps/chosen": -182.11109924316406, "logps/rejected": -184.9093017578125, "loss": 0.6548, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020562218502163887, "rewards/margins": 0.06886212527751923, "rewards/margins_max": 0.10518188774585724, "rewards/margins_min": 0.032542359083890915, "rewards/margins_std": 0.051363904029130936, "rewards/rejected": -0.04829990863800049, "step": 990 }, { "epoch": 0.38, "grad_norm": 0.310546875, "learning_rate": 3.8999369653439883e-07, "logits/chosen": -1.4692192077636719, "logits/rejected": -1.1662304401397705, "logps/chosen": -218.901123046875, "logps/rejected": -259.073486328125, "loss": 0.6544, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021590515971183777, "rewards/margins": 0.07978875935077667, "rewards/margins_max": 0.11681132018566132, "rewards/margins_min": 0.04276617616415024, "rewards/margins_std": 0.05235783010721207, "rewards/rejected": -0.0581982359290123, "step": 1000 }, { "epoch": 0.38, "grad_norm": 0.431640625, "learning_rate": 3.872335266465565e-07, "logits/chosen": -1.449806571006775, "logits/rejected": -1.1126211881637573, "logps/chosen": -208.30508422851562, "logps/rejected": -209.28854370117188, "loss": 0.6526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01985124871134758, "rewards/margins": 0.09691493213176727, "rewards/margins_max": 0.13921746611595154, "rewards/margins_min": 0.05461239814758301, "rewards/margins_std": 0.059824805706739426, "rewards/rejected": -0.0770636796951294, "step": 1010 }, { "epoch": 0.39, "grad_norm": 0.2578125, "learning_rate": 3.8444920269523564e-07, "logits/chosen": -1.4464867115020752, "logits/rejected": -1.1247550249099731, "logps/chosen": -208.340087890625, "logps/rejected": -231.50064086914062, "loss": 0.6555, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02373802661895752, "rewards/margins": 0.07935117930173874, "rewards/margins_max": 0.11073676496744156, "rewards/margins_min": 0.04796561598777771, "rewards/margins_std": 0.04438590258359909, "rewards/rejected": -0.055613160133361816, "step": 1020 }, { "epoch": 0.39, "grad_norm": 0.3046875, "learning_rate": 3.8164121474098557e-07, "logits/chosen": -1.4539260864257812, "logits/rejected": -1.0766693353652954, "logps/chosen": -208.6253662109375, "logps/rejected": -219.2858428955078, "loss": 0.6552, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.015472279861569405, "rewards/margins": 0.07762763649225235, "rewards/margins_max": 0.10684994608163834, "rewards/margins_min": 0.04840531200170517, "rewards/margins_std": 0.04132659360766411, "rewards/rejected": -0.0621553435921669, "step": 1030 }, { "epoch": 0.4, "grad_norm": 0.2470703125, "learning_rate": 3.7881005700938627e-07, "logits/chosen": -1.4579023122787476, "logits/rejected": -1.067096471786499, "logps/chosen": -195.89137268066406, "logps/rejected": -208.55618286132812, "loss": 0.6504, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01986720971763134, "rewards/margins": 0.0889149010181427, "rewards/margins_max": 0.12377619743347168, "rewards/margins_min": 0.05405362695455551, "rewards/margins_std": 0.04930129647254944, "rewards/rejected": -0.0690477043390274, "step": 1040 }, { "epoch": 0.4, "grad_norm": 0.326171875, "learning_rate": 3.759562278040611e-07, "logits/chosen": -1.3401424884796143, "logits/rejected": -1.1433542966842651, "logps/chosen": -187.8769989013672, "logps/rejected": -207.67489624023438, "loss": 0.6523, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.019486157223582268, "rewards/margins": 0.08506642282009125, "rewards/margins_max": 0.1269386112689972, "rewards/margins_min": 0.043194226920604706, "rewards/margins_std": 0.059216223657131195, "rewards/rejected": -0.06558026373386383, "step": 1050 }, { "epoch": 0.4, "grad_norm": 0.353515625, "learning_rate": 3.7308022941897176e-07, "logits/chosen": -1.3987281322479248, "logits/rejected": -1.0789750814437866, "logps/chosen": -222.21798706054688, "logps/rejected": -221.2227783203125, "loss": 0.6494, "rewards/accuracies": 0.9375, "rewards/chosen": 0.021621376276016235, "rewards/margins": 0.09140492975711823, "rewards/margins_max": 0.1279296576976776, "rewards/margins_min": 0.05488022044301033, "rewards/margins_std": 0.05165375396609306, "rewards/rejected": -0.06978355348110199, "step": 1060 }, { "epoch": 0.41, "grad_norm": 0.29296875, "learning_rate": 3.7018256805001115e-07, "logits/chosen": -1.4328137636184692, "logits/rejected": -1.0926902294158936, "logps/chosen": -208.5886688232422, "logps/rejected": -234.0699462890625, "loss": 0.6478, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.025926152244210243, "rewards/margins": 0.09499989449977875, "rewards/margins_max": 0.1423562914133072, "rewards/margins_min": 0.047643501311540604, "rewards/margins_std": 0.06697206199169159, "rewards/rejected": -0.06907374411821365, "step": 1070 }, { "epoch": 0.41, "grad_norm": 0.310546875, "learning_rate": 3.6726375370590924e-07, "logits/chosen": -1.4664019346237183, "logits/rejected": -1.0370583534240723, "logps/chosen": -245.40286254882812, "logps/rejected": -201.60385131835938, "loss": 0.6554, "rewards/accuracies": 0.9375, "rewards/chosen": 0.016693558543920517, "rewards/margins": 0.07288383692502975, "rewards/margins_max": 0.10198800265789032, "rewards/margins_min": 0.043779678642749786, "rewards/margins_std": 0.041159503161907196, "rewards/rejected": -0.056190282106399536, "step": 1080 }, { "epoch": 0.41, "grad_norm": 0.353515625, "learning_rate": 3.6432430011846825e-07, "logits/chosen": -1.4667326211929321, "logits/rejected": -1.1126149892807007, "logps/chosen": -199.15652465820312, "logps/rejected": -224.69058227539062, "loss": 0.6495, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022613363340497017, "rewards/margins": 0.09035179764032364, "rewards/margins_max": 0.1315051019191742, "rewards/margins_min": 0.04919849708676338, "rewards/margins_std": 0.05819956213235855, "rewards/rejected": -0.06773844361305237, "step": 1090 }, { "epoch": 0.42, "grad_norm": 0.3359375, "learning_rate": 3.613647246521419e-07, "logits/chosen": -1.5111665725708008, "logits/rejected": -1.047363519668579, "logps/chosen": -213.94393920898438, "logps/rejected": -214.4349365234375, "loss": 0.6534, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.020266389474272728, "rewards/margins": 0.08451583981513977, "rewards/margins_max": 0.12752871215343475, "rewards/margins_min": 0.04150295630097389, "rewards/margins_std": 0.06082940101623535, "rewards/rejected": -0.0642494484782219, "step": 1100 }, { "epoch": 0.42, "grad_norm": 0.294921875, "learning_rate": 3.583855482129755e-07, "logits/chosen": -1.4186838865280151, "logits/rejected": -1.1392043828964233, "logps/chosen": -214.18783569335938, "logps/rejected": -266.89801025390625, "loss": 0.6486, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.024385053664445877, "rewards/margins": 0.09580737352371216, "rewards/margins_max": 0.1446331888437271, "rewards/margins_min": 0.0469815619289875, "rewards/margins_std": 0.0690501257777214, "rewards/rejected": -0.07142232358455658, "step": 1110 }, { "epoch": 0.43, "grad_norm": 0.328125, "learning_rate": 3.5538729515692354e-07, "logits/chosen": -1.4348571300506592, "logits/rejected": -1.005236268043518, "logps/chosen": -222.6161346435547, "logps/rejected": -215.0897674560547, "loss": 0.6544, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02178863435983658, "rewards/margins": 0.0858430564403534, "rewards/margins_max": 0.12585435807704926, "rewards/margins_min": 0.04583176225423813, "rewards/margins_std": 0.056584518402814865, "rewards/rejected": -0.06405442208051682, "step": 1120 }, { "epoch": 0.43, "grad_norm": 0.310546875, "learning_rate": 3.523704931975588e-07, "logits/chosen": -1.4166271686553955, "logits/rejected": -1.1025335788726807, "logps/chosen": -214.4542999267578, "logps/rejected": -231.10617065429688, "loss": 0.6537, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02049335464835167, "rewards/margins": 0.07652363926172256, "rewards/margins_max": 0.11191650480031967, "rewards/margins_min": 0.041130781173706055, "rewards/margins_std": 0.05005306005477905, "rewards/rejected": -0.056030284613370895, "step": 1130 }, { "epoch": 0.43, "grad_norm": 0.328125, "learning_rate": 3.4933567331319086e-07, "logits/chosen": -1.4324204921722412, "logits/rejected": -1.0857694149017334, "logps/chosen": -220.71957397460938, "logps/rejected": -210.15182495117188, "loss": 0.6499, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021782949566841125, "rewards/margins": 0.09863193333148956, "rewards/margins_max": 0.14462308585643768, "rewards/margins_min": 0.052640777081251144, "rewards/margins_std": 0.06504130363464355, "rewards/rejected": -0.07684897631406784, "step": 1140 }, { "epoch": 0.44, "grad_norm": 0.390625, "learning_rate": 3.46283369653411e-07, "logits/chosen": -1.4875332117080688, "logits/rejected": -1.1682199239730835, "logps/chosen": -225.19692993164062, "logps/rejected": -249.2867889404297, "loss": 0.6514, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020085861906409264, "rewards/margins": 0.08808033168315887, "rewards/margins_max": 0.12566521763801575, "rewards/margins_min": 0.05049543455243111, "rewards/margins_std": 0.05315307527780533, "rewards/rejected": -0.06799447536468506, "step": 1150 }, { "epoch": 0.44, "grad_norm": 0.63671875, "learning_rate": 3.4321411944507714e-07, "logits/chosen": -1.4519484043121338, "logits/rejected": -1.1154358386993408, "logps/chosen": -211.3180389404297, "logps/rejected": -290.20086669921875, "loss": 0.6523, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02073494717478752, "rewards/margins": 0.08044596761465073, "rewards/margins_max": 0.11118495464324951, "rewards/margins_min": 0.04970698431134224, "rewards/margins_std": 0.04347149282693863, "rewards/rejected": -0.059711016714572906, "step": 1160 }, { "epoch": 0.44, "grad_norm": 0.3203125, "learning_rate": 3.40128462897759e-07, "logits/chosen": -1.4702914953231812, "logits/rejected": -1.1601308584213257, "logps/chosen": -199.1900177001953, "logps/rejected": -213.91455078125, "loss": 0.6496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.014372904784977436, "rewards/margins": 0.0744035467505455, "rewards/margins_max": 0.10700450092554092, "rewards/margins_min": 0.04180259257555008, "rewards/margins_std": 0.04610472172498703, "rewards/rejected": -0.06003064662218094, "step": 1170 }, { "epoch": 0.45, "grad_norm": 0.373046875, "learning_rate": 3.3702694310865693e-07, "logits/chosen": -1.334912896156311, "logits/rejected": -1.1364924907684326, "logps/chosen": -185.46609497070312, "logps/rejected": -222.8370819091797, "loss": 0.6493, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029090750962495804, "rewards/margins": 0.09566773474216461, "rewards/margins_max": 0.1352817267179489, "rewards/margins_min": 0.05605371668934822, "rewards/margins_std": 0.05602266266942024, "rewards/rejected": -0.06657697260379791, "step": 1180 }, { "epoch": 0.45, "grad_norm": 0.275390625, "learning_rate": 3.339101059670131e-07, "logits/chosen": -1.3308570384979248, "logits/rejected": -1.011419653892517, "logps/chosen": -220.4107666015625, "logps/rejected": -240.52627563476562, "loss": 0.6521, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.01713314652442932, "rewards/margins": 0.08281738311052322, "rewards/margins_max": 0.12792006134986877, "rewards/margins_min": 0.037714701145887375, "rewards/margins_std": 0.06378481537103653, "rewards/rejected": -0.0656842291355133, "step": 1190 }, { "epoch": 0.46, "grad_norm": 0.26953125, "learning_rate": 3.3077850005803125e-07, "logits/chosen": -1.377722144126892, "logits/rejected": -1.073115587234497, "logps/chosen": -217.9221649169922, "logps/rejected": -207.5659637451172, "loss": 0.6558, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02081725373864174, "rewards/margins": 0.06937672197818756, "rewards/margins_max": 0.10580576956272125, "rewards/margins_min": 0.03294768184423447, "rewards/margins_std": 0.05151844024658203, "rewards/rejected": -0.048559464514255524, "step": 1200 }, { "epoch": 0.46, "grad_norm": 0.35546875, "learning_rate": 3.276326765663218e-07, "logits/chosen": -1.3020093441009521, "logits/rejected": -1.0328706502914429, "logps/chosen": -237.2034149169922, "logps/rejected": -226.58364868164062, "loss": 0.6534, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01786690205335617, "rewards/margins": 0.07763433456420898, "rewards/margins_max": 0.10978218168020248, "rewards/margins_min": 0.04548647999763489, "rewards/margins_std": 0.045463927090168, "rewards/rejected": -0.05976742506027222, "step": 1210 }, { "epoch": 0.46, "grad_norm": 0.376953125, "learning_rate": 3.244731891788893e-07, "logits/chosen": -1.4796888828277588, "logits/rejected": -1.1671749353408813, "logps/chosen": -199.0811767578125, "logps/rejected": -237.5238494873047, "loss": 0.6469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.032245419919490814, "rewards/margins": 0.09224997460842133, "rewards/margins_max": 0.12662890553474426, "rewards/margins_min": 0.05787103250622749, "rewards/margins_std": 0.04861915856599808, "rewards/rejected": -0.06000455096364021, "step": 1220 }, { "epoch": 0.47, "grad_norm": 0.359375, "learning_rate": 3.2130059398768005e-07, "logits/chosen": -1.3609169721603394, "logits/rejected": -1.0250881910324097, "logps/chosen": -213.1770782470703, "logps/rejected": -201.0860595703125, "loss": 0.652, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.014574935659766197, "rewards/margins": 0.08546517789363861, "rewards/margins_max": 0.12767310440540314, "rewards/margins_min": 0.04325725510716438, "rewards/margins_std": 0.059691011905670166, "rewards/rejected": -0.07089023292064667, "step": 1230 }, { "epoch": 0.47, "grad_norm": 0.251953125, "learning_rate": 3.1811544939170573e-07, "logits/chosen": -1.4496055841445923, "logits/rejected": -1.250016212463379, "logps/chosen": -197.13966369628906, "logps/rejected": -228.1790313720703, "loss": 0.6529, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.022412937134504318, "rewards/margins": 0.0773552656173706, "rewards/margins_max": 0.11442377418279648, "rewards/margins_min": 0.04028675705194473, "rewards/margins_std": 0.05242278426885605, "rewards/rejected": -0.054942332208156586, "step": 1240 }, { "epoch": 0.47, "grad_norm": 0.33984375, "learning_rate": 3.1491831599876105e-07, "logits/chosen": -1.4599530696868896, "logits/rejected": -1.1350712776184082, "logps/chosen": -198.54531860351562, "logps/rejected": -211.37203979492188, "loss": 0.6482, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024880778044462204, "rewards/margins": 0.10913392156362534, "rewards/margins_max": 0.1501014530658722, "rewards/margins_min": 0.06816640496253967, "rewards/margins_std": 0.05793682858347893, "rewards/rejected": -0.08425314724445343, "step": 1250 }, { "epoch": 0.48, "grad_norm": 0.33203125, "learning_rate": 3.117097565267534e-07, "logits/chosen": -1.4878017902374268, "logits/rejected": -1.150596022605896, "logps/chosen": -236.7520294189453, "logps/rejected": -269.3097839355469, "loss": 0.6456, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023586656898260117, "rewards/margins": 0.09756810963153839, "rewards/margins_max": 0.14421576261520386, "rewards/margins_min": 0.05092043802142143, "rewards/margins_std": 0.06596976518630981, "rewards/rejected": -0.07398144900798798, "step": 1260 }, { "epoch": 0.48, "grad_norm": 0.296875, "learning_rate": 3.0849033570466013e-07, "logits/chosen": -1.3854446411132812, "logits/rejected": -1.1156353950500488, "logps/chosen": -211.7381134033203, "logps/rejected": -230.3738250732422, "loss": 0.647, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.024476120248436928, "rewards/margins": 0.09338697046041489, "rewards/margins_max": 0.13921865820884705, "rewards/margins_min": 0.04755526781082153, "rewards/margins_std": 0.06481581181287766, "rewards/rejected": -0.06891084462404251, "step": 1270 }, { "epoch": 0.49, "grad_norm": 0.27734375, "learning_rate": 3.0526062017313247e-07, "logits/chosen": -1.379686951637268, "logits/rejected": -1.134813904762268, "logps/chosen": -190.8585968017578, "logps/rejected": -205.110595703125, "loss": 0.6482, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018528077751398087, "rewards/margins": 0.08439335227012634, "rewards/margins_max": 0.11601463705301285, "rewards/margins_min": 0.05277208238840103, "rewards/margins_std": 0.044719234108924866, "rewards/rejected": -0.06586527824401855, "step": 1280 }, { "epoch": 0.49, "grad_norm": 0.3203125, "learning_rate": 3.020211783847625e-07, "logits/chosen": -1.5027602910995483, "logits/rejected": -1.1443543434143066, "logps/chosen": -200.97781372070312, "logps/rejected": -213.7756805419922, "loss": 0.6525, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02603713609278202, "rewards/margins": 0.08934494107961655, "rewards/margins_max": 0.1271466463804245, "rewards/margins_min": 0.05154324695467949, "rewards/margins_std": 0.05345967411994934, "rewards/rejected": -0.06330780684947968, "step": 1290 }, { "epoch": 0.49, "grad_norm": 0.3046875, "learning_rate": 2.987725805040321e-07, "logits/chosen": -1.5109453201293945, "logits/rejected": -1.1658785343170166, "logps/chosen": -194.1292266845703, "logps/rejected": -226.944091796875, "loss": 0.65, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.020652558654546738, "rewards/margins": 0.08578969538211823, "rewards/margins_max": 0.12059824168682098, "rewards/margins_min": 0.05098114535212517, "rewards/margins_std": 0.049226727336645126, "rewards/rejected": -0.06513713300228119, "step": 1300 }, { "epoch": 0.5, "grad_norm": 0.291015625, "learning_rate": 2.955153983069593e-07, "logits/chosen": -1.3911088705062866, "logits/rejected": -1.0021635293960571, "logps/chosen": -216.1819305419922, "logps/rejected": -217.72048950195312, "loss": 0.6533, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009969010017812252, "rewards/margins": 0.0843484178185463, "rewards/margins_max": 0.11825823783874512, "rewards/margins_min": 0.050438590347766876, "rewards/margins_std": 0.04795572906732559, "rewards/rejected": -0.07437939941883087, "step": 1310 }, { "epoch": 0.5, "grad_norm": 0.341796875, "learning_rate": 2.922502050804623e-07, "logits/chosen": -1.3908171653747559, "logits/rejected": -1.1572504043579102, "logps/chosen": -209.4795379638672, "logps/rejected": -240.86074829101562, "loss": 0.6489, "rewards/accuracies": 1.0, "rewards/chosen": 0.02364761009812355, "rewards/margins": 0.10014495998620987, "rewards/margins_max": 0.13577914237976074, "rewards/margins_min": 0.0645107850432396, "rewards/margins_std": 0.05039433762431145, "rewards/rejected": -0.07649735361337662, "step": 1320 }, { "epoch": 0.51, "grad_norm": 0.330078125, "learning_rate": 2.889775755214565e-07, "logits/chosen": -1.4878746271133423, "logits/rejected": -1.0873339176177979, "logps/chosen": -208.18862915039062, "logps/rejected": -243.1056671142578, "loss": 0.6486, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02549285627901554, "rewards/margins": 0.08646980673074722, "rewards/margins_max": 0.12378053367137909, "rewards/margins_min": 0.04915907606482506, "rewards/margins_std": 0.05276532843708992, "rewards/rejected": -0.060976944863796234, "step": 1330 }, { "epoch": 0.51, "grad_norm": 0.279296875, "learning_rate": 2.8569808563570406e-07, "logits/chosen": -1.4435174465179443, "logits/rejected": -1.0699201822280884, "logps/chosen": -195.5081024169922, "logps/rejected": -198.16644287109375, "loss": 0.6521, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02266775444149971, "rewards/margins": 0.07937842607498169, "rewards/margins_max": 0.11254648864269257, "rewards/margins_min": 0.046210356056690216, "rewards/margins_std": 0.0469067320227623, "rewards/rejected": -0.056710679084062576, "step": 1340 }, { "epoch": 0.51, "grad_norm": 0.2734375, "learning_rate": 2.8241231263643284e-07, "logits/chosen": -1.456235408782959, "logits/rejected": -1.1784732341766357, "logps/chosen": -213.12203979492188, "logps/rejected": -240.123046875, "loss": 0.6476, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01615014672279358, "rewards/margins": 0.09304296225309372, "rewards/margins_max": 0.13617947697639465, "rewards/margins_min": 0.04990645498037338, "rewards/margins_std": 0.06100423261523247, "rewards/rejected": -0.07689281553030014, "step": 1350 }, { "epoch": 0.52, "grad_norm": 0.291015625, "learning_rate": 2.791208348427426e-07, "logits/chosen": -1.4464397430419922, "logits/rejected": -1.1199182271957397, "logps/chosen": -214.69924926757812, "logps/rejected": -198.94326782226562, "loss": 0.6532, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01893942430615425, "rewards/margins": 0.0802186131477356, "rewards/margins_max": 0.11644063889980316, "rewards/margins_min": 0.04399657994508743, "rewards/margins_std": 0.05122567340731621, "rewards/rejected": -0.061279188841581345, "step": 1360 }, { "epoch": 0.52, "grad_norm": 0.265625, "learning_rate": 2.758242315778172e-07, "logits/chosen": -1.3660023212432861, "logits/rejected": -0.9832341074943542, "logps/chosen": -235.3618927001953, "logps/rejected": -201.94151306152344, "loss": 0.6526, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01969451829791069, "rewards/margins": 0.08969944715499878, "rewards/margins_max": 0.12913790345191956, "rewards/margins_min": 0.050260983407497406, "rewards/margins_std": 0.05577441304922104, "rewards/rejected": -0.07000492513179779, "step": 1370 }, { "epoch": 0.52, "grad_norm": 0.302734375, "learning_rate": 2.725230830669591e-07, "logits/chosen": -1.3614161014556885, "logits/rejected": -1.0590187311172485, "logps/chosen": -206.86636352539062, "logps/rejected": -194.6129150390625, "loss": 0.6539, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01725422963500023, "rewards/margins": 0.07537270337343216, "rewards/margins_max": 0.11255017668008804, "rewards/margins_min": 0.03819523751735687, "rewards/margins_std": 0.052576880902051926, "rewards/rejected": -0.05811848118901253, "step": 1380 }, { "epoch": 0.53, "grad_norm": 0.3828125, "learning_rate": 2.6921797033546604e-07, "logits/chosen": -1.4232187271118164, "logits/rejected": -1.0422379970550537, "logps/chosen": -225.4199676513672, "logps/rejected": -247.0299072265625, "loss": 0.6472, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023638783022761345, "rewards/margins": 0.10305992513895035, "rewards/margins_max": 0.15439125895500183, "rewards/margins_min": 0.05172859877347946, "rewards/margins_std": 0.07259346544742584, "rewards/rejected": -0.07942114025354385, "step": 1390 }, { "epoch": 0.53, "grad_norm": 0.330078125, "learning_rate": 2.6590947510636656e-07, "logits/chosen": -1.4895226955413818, "logits/rejected": -1.1499069929122925, "logps/chosen": -224.0960693359375, "logps/rejected": -234.5597686767578, "loss": 0.6522, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01984899304807186, "rewards/margins": 0.0820828229188919, "rewards/margins_max": 0.12106503546237946, "rewards/margins_min": 0.04310062155127525, "rewards/margins_std": 0.05512915924191475, "rewards/rejected": -0.062233828008174896, "step": 1400 }, { "epoch": 0.54, "grad_norm": 0.322265625, "learning_rate": 2.625981796980323e-07, "logits/chosen": -1.4586188793182373, "logits/rejected": -1.1192744970321655, "logps/chosen": -245.0766143798828, "logps/rejected": -215.29385375976562, "loss": 0.6537, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01640249788761139, "rewards/margins": 0.08490737527608871, "rewards/margins_max": 0.12561504542827606, "rewards/margins_min": 0.04419969767332077, "rewards/margins_std": 0.05756935477256775, "rewards/rejected": -0.06850487738847733, "step": 1410 }, { "epoch": 0.54, "grad_norm": 0.2734375, "learning_rate": 2.5928466692168616e-07, "logits/chosen": -1.4357595443725586, "logits/rejected": -1.1410107612609863, "logps/chosen": -186.00540161132812, "logps/rejected": -211.6009063720703, "loss": 0.6491, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022374480962753296, "rewards/margins": 0.09253247827291489, "rewards/margins_max": 0.13468560576438904, "rewards/margins_min": 0.05037938430905342, "rewards/margins_std": 0.059613488614559174, "rewards/rejected": -0.07015800476074219, "step": 1420 }, { "epoch": 0.54, "grad_norm": 0.318359375, "learning_rate": 2.559695199788234e-07, "logits/chosen": -1.3736810684204102, "logits/rejected": -1.1410489082336426, "logps/chosen": -206.69204711914062, "logps/rejected": -219.3141326904297, "loss": 0.6496, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.019356805831193924, "rewards/margins": 0.0815727710723877, "rewards/margins_max": 0.11693547666072845, "rewards/margins_min": 0.046210046857595444, "rewards/margins_std": 0.05001043155789375, "rewards/rejected": -0.06221596151590347, "step": 1430 }, { "epoch": 0.55, "grad_norm": 0.33984375, "learning_rate": 2.526533223585641e-07, "logits/chosen": -1.556718111038208, "logits/rejected": -1.207423448562622, "logps/chosen": -178.8986358642578, "logps/rejected": -196.97682189941406, "loss": 0.6522, "rewards/accuracies": 1.0, "rewards/chosen": 0.01927010528743267, "rewards/margins": 0.08486685901880264, "rewards/margins_max": 0.12347618490457535, "rewards/margins_min": 0.046257536858320236, "rewards/margins_std": 0.054601818323135376, "rewards/rejected": -0.06559675186872482, "step": 1440 }, { "epoch": 0.55, "grad_norm": 0.318359375, "learning_rate": 2.4933665773495464e-07, "logits/chosen": -1.4177541732788086, "logits/rejected": -1.0632787942886353, "logps/chosen": -210.5023956298828, "logps/rejected": -220.0447998046875, "loss": 0.6456, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022884273901581764, "rewards/margins": 0.08784890919923782, "rewards/margins_max": 0.12712661921977997, "rewards/margins_min": 0.04857120290398598, "rewards/margins_std": 0.055547066032886505, "rewards/rejected": -0.06496462970972061, "step": 1450 }, { "epoch": 0.55, "grad_norm": 0.330078125, "learning_rate": 2.460201098642378e-07, "logits/chosen": -1.4159841537475586, "logits/rejected": -1.0748342275619507, "logps/chosen": -199.53170776367188, "logps/rejected": -206.43692016601562, "loss": 0.6423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.020755691453814507, "rewards/margins": 0.10842639207839966, "rewards/margins_max": 0.15878863632678986, "rewards/margins_min": 0.05806415155529976, "rewards/margins_std": 0.07122296094894409, "rewards/rejected": -0.08767069876194, "step": 1460 }, { "epoch": 0.56, "grad_norm": 0.3203125, "learning_rate": 2.4270426248210635e-07, "logits/chosen": -1.4016398191452026, "logits/rejected": -1.0987274646759033, "logps/chosen": -203.04690551757812, "logps/rejected": -224.54849243164062, "loss": 0.6527, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.018762022256851196, "rewards/margins": 0.08783960342407227, "rewards/margins_max": 0.1270819753408432, "rewards/margins_min": 0.04859720915555954, "rewards/margins_std": 0.05549710988998413, "rewards/rejected": -0.06907757371664047, "step": 1470 }, { "epoch": 0.56, "grad_norm": 0.306640625, "learning_rate": 2.3938969920096296e-07, "logits/chosen": -1.4343984127044678, "logits/rejected": -1.121087670326233, "logps/chosen": -222.9105987548828, "logps/rejected": -221.96896362304688, "loss": 0.6503, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021892910823225975, "rewards/margins": 0.08273215591907501, "rewards/margins_max": 0.12083101272583008, "rewards/margins_min": 0.04463329166173935, "rewards/margins_std": 0.05387992784380913, "rewards/rejected": -0.060839246958494186, "step": 1480 }, { "epoch": 0.57, "grad_norm": 0.32421875, "learning_rate": 2.3607700340719872e-07, "logits/chosen": -1.4712364673614502, "logits/rejected": -1.1130168437957764, "logps/chosen": -204.46397399902344, "logps/rejected": -216.9080047607422, "loss": 0.6507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02178640104830265, "rewards/margins": 0.08826582878828049, "rewards/margins_max": 0.1252831369638443, "rewards/margins_min": 0.05124853178858757, "rewards/margins_std": 0.052350372076034546, "rewards/rejected": -0.06647942960262299, "step": 1490 }, { "epoch": 0.57, "grad_norm": 0.328125, "learning_rate": 2.3276675815851439e-07, "logits/chosen": -1.3515491485595703, "logits/rejected": -1.1509690284729004, "logps/chosen": -184.31765747070312, "logps/rejected": -215.8578643798828, "loss": 0.6516, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024958888068795204, "rewards/margins": 0.09194238483905792, "rewards/margins_max": 0.13820824027061462, "rewards/margins_min": 0.04567654803395271, "rewards/margins_std": 0.06542977690696716, "rewards/rejected": -0.06698349863290787, "step": 1500 }, { "epoch": 0.57, "grad_norm": 0.3203125, "learning_rate": 2.2945954608129725e-07, "logits/chosen": -1.4680145978927612, "logits/rejected": -1.1459739208221436, "logps/chosen": -220.7042236328125, "logps/rejected": -237.5146942138672, "loss": 0.6523, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.018372472375631332, "rewards/margins": 0.08562228828668594, "rewards/margins_max": 0.12245050817728043, "rewards/margins_min": 0.048794087022542953, "rewards/margins_std": 0.052082955837249756, "rewards/rejected": -0.0672498270869255, "step": 1510 }, { "epoch": 0.58, "grad_norm": 0.259765625, "learning_rate": 2.261559492680755e-07, "logits/chosen": -1.5185743570327759, "logits/rejected": -1.0897482633590698, "logps/chosen": -221.81869506835938, "logps/rejected": -223.1243896484375, "loss": 0.6469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024064363911747932, "rewards/margins": 0.10357926785945892, "rewards/margins_max": 0.14326122403144836, "rewards/margins_min": 0.06389732658863068, "rewards/margins_std": 0.05611874908208847, "rewards/rejected": -0.07951490581035614, "step": 1520 }, { "epoch": 0.58, "grad_norm": 0.400390625, "learning_rate": 2.2285654917506511e-07, "logits/chosen": -1.4201505184173584, "logits/rejected": -1.1784555912017822, "logps/chosen": -197.08529663085938, "logps/rejected": -233.5238037109375, "loss": 0.6519, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.014250082895159721, "rewards/margins": 0.0851346030831337, "rewards/margins_max": 0.12699416279792786, "rewards/margins_min": 0.04327503591775894, "rewards/margins_std": 0.05919836089015007, "rewards/rejected": -0.07088451087474823, "step": 1530 }, { "epoch": 0.59, "grad_norm": 0.314453125, "learning_rate": 2.1956192651983025e-07, "logits/chosen": -1.4901044368743896, "logits/rejected": -1.2225561141967773, "logps/chosen": -205.6460418701172, "logps/rejected": -211.71597290039062, "loss": 0.6483, "rewards/accuracies": 0.9375, "rewards/chosen": 0.014587330631911755, "rewards/margins": 0.08248572051525116, "rewards/margins_max": 0.11513223499059677, "rewards/margins_min": 0.049839213490486145, "rewards/margins_std": 0.04616914689540863, "rewards/rejected": -0.06789840012788773, "step": 1540 }, { "epoch": 0.59, "grad_norm": 0.30859375, "learning_rate": 2.1627266117907206e-07, "logits/chosen": -1.4423274993896484, "logits/rejected": -1.1351933479309082, "logps/chosen": -192.0325927734375, "logps/rejected": -214.21127319335938, "loss": 0.6511, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.019394617527723312, "rewards/margins": 0.08463616669178009, "rewards/margins_max": 0.1280382126569748, "rewards/margins_min": 0.04123411327600479, "rewards/margins_std": 0.06137976795434952, "rewards/rejected": -0.06524154543876648, "step": 1550 }, { "epoch": 0.59, "grad_norm": 0.328125, "learning_rate": 2.1298933208656715e-07, "logits/chosen": -1.4492504596710205, "logits/rejected": -1.0848209857940674, "logps/chosen": -213.02944946289062, "logps/rejected": -244.51010131835938, "loss": 0.6513, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01053069718182087, "rewards/margins": 0.08705185353755951, "rewards/margins_max": 0.12572325766086578, "rewards/margins_min": 0.04838045313954353, "rewards/margins_std": 0.05468962341547012, "rewards/rejected": -0.07652115821838379, "step": 1560 }, { "epoch": 0.6, "grad_norm": 0.390625, "learning_rate": 2.0971251713127064e-07, "logits/chosen": -1.4406160116195679, "logits/rejected": -1.0886666774749756, "logps/chosen": -214.84988403320312, "logps/rejected": -218.6672821044922, "loss": 0.6469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.023885857313871384, "rewards/margins": 0.09796186536550522, "rewards/margins_max": 0.13213284313678741, "rewards/margins_min": 0.06379088014364243, "rewards/margins_std": 0.048325065523386, "rewards/rejected": -0.07407601177692413, "step": 1570 }, { "epoch": 0.6, "grad_norm": 0.333984375, "learning_rate": 2.0644279305560378e-07, "logits/chosen": -1.3928980827331543, "logits/rejected": -1.1537230014801025, "logps/chosen": -209.4993438720703, "logps/rejected": -224.5998077392578, "loss": 0.6523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.024306269362568855, "rewards/margins": 0.07895907014608383, "rewards/margins_max": 0.1160157099366188, "rewards/margins_min": 0.04190244525671005, "rewards/margins_std": 0.05240599066019058, "rewards/rejected": -0.054652802646160126, "step": 1580 }, { "epoch": 0.6, "grad_norm": 0.29296875, "learning_rate": 2.0318073535394322e-07, "logits/chosen": -1.330426812171936, "logits/rejected": -1.1868915557861328, "logps/chosen": -209.96444702148438, "logps/rejected": -228.32839965820312, "loss": 0.6491, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021598026156425476, "rewards/margins": 0.09060736745595932, "rewards/margins_max": 0.1294335275888443, "rewards/margins_min": 0.051781199872493744, "rewards/margins_std": 0.054908476769924164, "rewards/rejected": -0.06900934129953384, "step": 1590 }, { "epoch": 0.61, "grad_norm": 0.33984375, "learning_rate": 1.9992691817133024e-07, "logits/chosen": -1.3735512495040894, "logits/rejected": -1.0827502012252808, "logps/chosen": -204.10800170898438, "logps/rejected": -213.89306640625, "loss": 0.6488, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.021180730313062668, "rewards/margins": 0.08791421353816986, "rewards/margins_max": 0.13012385368347168, "rewards/margins_min": 0.04570458456873894, "rewards/margins_std": 0.059693437069654465, "rewards/rejected": -0.06673348695039749, "step": 1600 }, { "epoch": 0.61, "grad_norm": 0.296875, "learning_rate": 1.9668191420241654e-07, "logits/chosen": -1.411036491394043, "logits/rejected": -1.0531718730926514, "logps/chosen": -196.48928833007812, "logps/rejected": -218.87850952148438, "loss": 0.6484, "rewards/accuracies": 1.0, "rewards/chosen": 0.02457505650818348, "rewards/margins": 0.09790615737438202, "rewards/margins_max": 0.13226325809955597, "rewards/margins_min": 0.06354905664920807, "rewards/margins_std": 0.04858827963471413, "rewards/rejected": -0.07333110272884369, "step": 1610 }, { "epoch": 0.62, "grad_norm": 0.3046875, "learning_rate": 1.9344629459066676e-07, "logits/chosen": -1.3850080966949463, "logits/rejected": -1.1197946071624756, "logps/chosen": -199.50949096679688, "logps/rejected": -246.0616455078125, "loss": 0.6489, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02376171015202999, "rewards/margins": 0.08920769393444061, "rewards/margins_max": 0.12311413139104843, "rewards/margins_min": 0.055301256477832794, "rewards/margins_std": 0.04795095697045326, "rewards/rejected": -0.06544599682092667, "step": 1620 }, { "epoch": 0.62, "grad_norm": 0.357421875, "learning_rate": 1.902206288278326e-07, "logits/chosen": -1.5406492948532104, "logits/rejected": -1.1824986934661865, "logps/chosen": -202.54025268554688, "logps/rejected": -201.29934692382812, "loss": 0.6529, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01288673747330904, "rewards/margins": 0.07875024527311325, "rewards/margins_max": 0.1176532506942749, "rewards/margins_min": 0.039847247302532196, "rewards/margins_std": 0.055017150938510895, "rewards/rejected": -0.06586351245641708, "step": 1630 }, { "epoch": 0.62, "grad_norm": 0.279296875, "learning_rate": 1.8700548465371873e-07, "logits/chosen": -1.4827055931091309, "logits/rejected": -1.1093571186065674, "logps/chosen": -223.68710327148438, "logps/rejected": -239.01846313476562, "loss": 0.6496, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01819652132689953, "rewards/margins": 0.08910879492759705, "rewards/margins_max": 0.12292404472827911, "rewards/margins_min": 0.05529356002807617, "rewards/margins_std": 0.04782196134328842, "rewards/rejected": -0.07091227173805237, "step": 1640 }, { "epoch": 0.63, "grad_norm": 0.28515625, "learning_rate": 1.8380142795625613e-07, "logits/chosen": -1.3712780475616455, "logits/rejected": -1.0752792358398438, "logps/chosen": -202.42001342773438, "logps/rejected": -208.89230346679688, "loss": 0.6503, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02562437392771244, "rewards/margins": 0.08713234215974808, "rewards/margins_max": 0.1250787079334259, "rewards/margins_min": 0.04918598383665085, "rewards/margins_std": 0.053664255887269974, "rewards/rejected": -0.06150797754526138, "step": 1650 }, { "epoch": 0.63, "grad_norm": 0.28125, "learning_rate": 1.8060902267190248e-07, "logits/chosen": -1.3778380155563354, "logits/rejected": -1.0215446949005127, "logps/chosen": -221.0638885498047, "logps/rejected": -214.5088348388672, "loss": 0.6518, "rewards/accuracies": 1.0, "rewards/chosen": 0.02021961286664009, "rewards/margins": 0.08468443900346756, "rewards/margins_max": 0.11928977817296982, "rewards/margins_min": 0.050079114735126495, "rewards/margins_std": 0.04893932491540909, "rewards/rejected": -0.06446482241153717, "step": 1660 }, { "epoch": 0.63, "grad_norm": 0.326171875, "learning_rate": 1.7742883068638445e-07, "logits/chosen": -1.3894107341766357, "logits/rejected": -1.1386574506759644, "logps/chosen": -192.35226440429688, "logps/rejected": -209.70242309570312, "loss": 0.6478, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02140471711754799, "rewards/margins": 0.09159474074840546, "rewards/margins_max": 0.12955918908119202, "rewards/margins_min": 0.05363030359148979, "rewards/margins_std": 0.05368983745574951, "rewards/rejected": -0.07019002735614777, "step": 1670 }, { "epoch": 0.64, "grad_norm": 0.294921875, "learning_rate": 1.742614117358029e-07, "logits/chosen": -1.3801229000091553, "logits/rejected": -1.077549934387207, "logps/chosen": -200.5829620361328, "logps/rejected": -208.20291137695312, "loss": 0.6522, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018527880311012268, "rewards/margins": 0.09335903823375702, "rewards/margins_max": 0.13055115938186646, "rewards/margins_min": 0.05616689473390579, "rewards/margins_std": 0.05259762331843376, "rewards/rejected": -0.07483114302158356, "step": 1680 }, { "epoch": 0.64, "grad_norm": 0.30078125, "learning_rate": 1.7110732330811488e-07, "logits/chosen": -1.3090708255767822, "logits/rejected": -1.0196329355239868, "logps/chosen": -216.028076171875, "logps/rejected": -259.5859680175781, "loss": 0.6511, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.019178880378603935, "rewards/margins": 0.08465974032878876, "rewards/margins_max": 0.11861772835254669, "rewards/margins_min": 0.05070176720619202, "rewards/margins_std": 0.04802383482456207, "rewards/rejected": -0.06548087298870087, "step": 1690 }, { "epoch": 0.65, "grad_norm": 0.345703125, "learning_rate": 1.6796712054501167e-07, "logits/chosen": -1.3992605209350586, "logits/rejected": -1.054147720336914, "logps/chosen": -210.8378448486328, "logps/rejected": -222.1731719970703, "loss": 0.652, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021352563053369522, "rewards/margins": 0.08200518041849136, "rewards/margins_max": 0.12033899873495102, "rewards/margins_min": 0.04367135837674141, "rewards/margins_std": 0.054212212562561035, "rewards/rejected": -0.06065262109041214, "step": 1700 }, { "epoch": 0.65, "grad_norm": 0.39453125, "learning_rate": 1.6484135614421036e-07, "logits/chosen": -1.3646801710128784, "logits/rejected": -1.1284492015838623, "logps/chosen": -202.224609375, "logps/rejected": -231.50820922851562, "loss": 0.6486, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.020144078880548477, "rewards/margins": 0.09865118563175201, "rewards/margins_max": 0.12845909595489502, "rewards/margins_min": 0.0688432827591896, "rewards/margins_std": 0.042154744267463684, "rewards/rejected": -0.07850711047649384, "step": 1710 }, { "epoch": 0.65, "grad_norm": 0.2294921875, "learning_rate": 1.617305802621748e-07, "logits/chosen": -1.4775947332382202, "logits/rejected": -1.182840347290039, "logps/chosen": -226.96572875976562, "logps/rejected": -248.0793914794922, "loss": 0.6549, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022877058014273643, "rewards/margins": 0.08480893075466156, "rewards/margins_max": 0.12495218217372894, "rewards/margins_min": 0.04466567188501358, "rewards/margins_std": 0.05677112936973572, "rewards/rejected": -0.06193187087774277, "step": 1720 }, { "epoch": 0.66, "grad_norm": 0.3359375, "learning_rate": 1.586353404172846e-07, "logits/chosen": -1.5373847484588623, "logits/rejected": -1.1060173511505127, "logps/chosen": -220.9475860595703, "logps/rejected": -232.4903106689453, "loss": 0.6531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.018491679802536964, "rewards/margins": 0.09102120250463486, "rewards/margins_max": 0.1361791342496872, "rewards/margins_min": 0.04586326703429222, "rewards/margins_std": 0.06386296451091766, "rewards/rejected": -0.07252952456474304, "step": 1730 }, { "epoch": 0.66, "grad_norm": 0.27734375, "learning_rate": 1.5555618139346762e-07, "logits/chosen": -1.4408385753631592, "logits/rejected": -1.170309066772461, "logps/chosen": -208.6848602294922, "logps/rejected": -236.5205841064453, "loss": 0.6454, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0164918415248394, "rewards/margins": 0.08608251065015793, "rewards/margins_max": 0.11366148293018341, "rewards/margins_min": 0.058503538370132446, "rewards/margins_std": 0.03900256007909775, "rewards/rejected": -0.06959067285060883, "step": 1740 }, { "epoch": 0.66, "grad_norm": 0.30859375, "learning_rate": 1.5249364514431467e-07, "logits/chosen": -1.430936574935913, "logits/rejected": -1.2513033151626587, "logps/chosen": -195.7074432373047, "logps/rejected": -222.7970428466797, "loss": 0.6525, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014837482944130898, "rewards/margins": 0.08443491905927658, "rewards/margins_max": 0.11733835935592651, "rewards/margins_min": 0.05153145641088486, "rewards/margins_std": 0.046532515436410904, "rewards/rejected": -0.06959743797779083, "step": 1750 }, { "epoch": 0.67, "grad_norm": 0.3671875, "learning_rate": 1.4944827069769122e-07, "logits/chosen": -1.3783493041992188, "logits/rejected": -1.0512521266937256, "logps/chosen": -195.98391723632812, "logps/rejected": -220.15029907226562, "loss": 0.6491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023444708436727524, "rewards/margins": 0.09378419816493988, "rewards/margins_max": 0.13183923065662384, "rewards/margins_min": 0.055729180574417114, "rewards/margins_std": 0.05381792038679123, "rewards/rejected": -0.07033950090408325, "step": 1760 }, { "epoch": 0.67, "grad_norm": 0.31640625, "learning_rate": 1.4642059406086543e-07, "logits/chosen": -1.4898409843444824, "logits/rejected": -1.1888186931610107, "logps/chosen": -205.3476104736328, "logps/rejected": -194.2181396484375, "loss": 0.6518, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015554072335362434, "rewards/margins": 0.08319921046495438, "rewards/margins_max": 0.12011650949716568, "rewards/margins_min": 0.046281903982162476, "rewards/margins_std": 0.052208948880434036, "rewards/rejected": -0.0676451325416565, "step": 1770 }, { "epoch": 0.68, "grad_norm": 0.34765625, "learning_rate": 1.4341114812616648e-07, "logits/chosen": -1.44392991065979, "logits/rejected": -1.0090781450271606, "logps/chosen": -229.25161743164062, "logps/rejected": -213.78305053710938, "loss": 0.6496, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.018382510170340538, "rewards/margins": 0.08579520881175995, "rewards/margins_max": 0.1280362457036972, "rewards/margins_min": 0.043554168194532394, "rewards/margins_std": 0.05973784998059273, "rewards/rejected": -0.06741269677877426, "step": 1780 }, { "epoch": 0.68, "grad_norm": 0.298828125, "learning_rate": 1.404204625771926e-07, "logits/chosen": -1.4878530502319336, "logits/rejected": -1.113875389099121, "logps/chosen": -214.8065185546875, "logps/rejected": -246.6589813232422, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 0.02722371183335781, "rewards/margins": 0.09628230333328247, "rewards/margins_max": 0.13469961285591125, "rewards/margins_min": 0.05786500126123428, "rewards/margins_std": 0.05433027073740959, "rewards/rejected": -0.06905858218669891, "step": 1790 }, { "epoch": 0.68, "grad_norm": 0.3359375, "learning_rate": 1.3744906379558164e-07, "logits/chosen": -1.4668910503387451, "logits/rejected": -1.199731469154358, "logps/chosen": -196.6200714111328, "logps/rejected": -210.1691131591797, "loss": 0.647, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.023409178480505943, "rewards/margins": 0.09194588661193848, "rewards/margins_max": 0.13833829760551453, "rewards/margins_min": 0.04555344209074974, "rewards/margins_std": 0.06560881435871124, "rewards/rejected": -0.06853669881820679, "step": 1800 }, { "epoch": 0.69, "grad_norm": 0.314453125, "learning_rate": 1.3449747476836602e-07, "logits/chosen": -1.5464714765548706, "logits/rejected": -1.2193983793258667, "logps/chosen": -208.71432495117188, "logps/rejected": -219.8221435546875, "loss": 0.6489, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023995716124773026, "rewards/margins": 0.09224376827478409, "rewards/margins_max": 0.1384916752576828, "rewards/margins_min": 0.045995842665433884, "rewards/margins_std": 0.06540443003177643, "rewards/rejected": -0.06824804842472076, "step": 1810 }, { "epoch": 0.69, "grad_norm": 0.29296875, "learning_rate": 1.315662149959218e-07, "logits/chosen": -1.4520776271820068, "logits/rejected": -1.116135835647583, "logps/chosen": -209.5501251220703, "logps/rejected": -214.80197143554688, "loss": 0.658, "rewards/accuracies": 0.9375, "rewards/chosen": 0.017536520957946777, "rewards/margins": 0.06895823031663895, "rewards/margins_max": 0.09921301901340485, "rewards/margins_min": 0.038703449070453644, "rewards/margins_std": 0.042786724865436554, "rewards/rejected": -0.05142170935869217, "step": 1820 }, { "epoch": 0.7, "grad_norm": 0.337890625, "learning_rate": 1.286558004005338e-07, "logits/chosen": -1.4225823879241943, "logits/rejected": -1.0862594842910767, "logps/chosen": -193.0080108642578, "logps/rejected": -234.2497100830078, "loss": 0.6502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02093338966369629, "rewards/margins": 0.08371131867170334, "rewards/margins_max": 0.11859454959630966, "rewards/margins_min": 0.04882808402180672, "rewards/margins_std": 0.04933235049247742, "rewards/rejected": -0.06277792900800705, "step": 1830 }, { "epoch": 0.7, "grad_norm": 0.3046875, "learning_rate": 1.2576674323558928e-07, "logits/chosen": -1.4744040966033936, "logits/rejected": -1.200323462486267, "logps/chosen": -208.7011260986328, "logps/rejected": -238.9582061767578, "loss": 0.6498, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022860785946249962, "rewards/margins": 0.09148404747247696, "rewards/margins_max": 0.1325657218694687, "rewards/margins_min": 0.05040237307548523, "rewards/margins_std": 0.05809825658798218, "rewards/rejected": -0.06862326711416245, "step": 1840 }, { "epoch": 0.7, "grad_norm": 0.3125, "learning_rate": 1.228995519954183e-07, "logits/chosen": -1.4823616743087769, "logits/rejected": -1.0789930820465088, "logps/chosen": -224.8362274169922, "logps/rejected": -233.29721069335938, "loss": 0.6511, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02483968995511532, "rewards/margins": 0.08517090976238251, "rewards/margins_max": 0.11456477642059326, "rewards/margins_min": 0.05577705428004265, "rewards/margins_std": 0.041569195687770844, "rewards/rejected": -0.06033121794462204, "step": 1850 }, { "epoch": 0.71, "grad_norm": 0.26171875, "learning_rate": 1.2005473132579407e-07, "logits/chosen": -1.466604471206665, "logits/rejected": -1.0293363332748413, "logps/chosen": -217.72970581054688, "logps/rejected": -234.6483154296875, "loss": 0.652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.021058417856693268, "rewards/margins": 0.08528687059879303, "rewards/margins_max": 0.11772221326828003, "rewards/margins_min": 0.05285150930285454, "rewards/margins_std": 0.04587051644921303, "rewards/rejected": -0.06422845274209976, "step": 1860 }, { "epoch": 0.71, "grad_norm": 0.306640625, "learning_rate": 1.1723278193511322e-07, "logits/chosen": -1.4631662368774414, "logits/rejected": -1.0792559385299683, "logps/chosen": -249.8877716064453, "logps/rejected": -277.57452392578125, "loss": 0.6463, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.016841616481542587, "rewards/margins": 0.09164032340049744, "rewards/margins_max": 0.1224864274263382, "rewards/margins_min": 0.060794223099946976, "rewards/margins_std": 0.04362296685576439, "rewards/rejected": -0.07479871064424515, "step": 1870 }, { "epoch": 0.71, "grad_norm": 0.279296875, "learning_rate": 1.1443420050626623e-07, "logits/chosen": -1.4762096405029297, "logits/rejected": -1.2501494884490967, "logps/chosen": -199.69052124023438, "logps/rejected": -204.5517120361328, "loss": 0.6513, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01751863956451416, "rewards/margins": 0.0781446248292923, "rewards/margins_max": 0.11569873243570328, "rewards/margins_min": 0.04059051349759102, "rewards/margins_std": 0.05310952663421631, "rewards/rejected": -0.06062598153948784, "step": 1880 }, { "epoch": 0.72, "grad_norm": 0.353515625, "learning_rate": 1.1165947960921868e-07, "logits/chosen": -1.4795281887054443, "logits/rejected": -1.089855670928955, "logps/chosen": -247.3242645263672, "logps/rejected": -246.82656860351562, "loss": 0.6495, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.011459515430033207, "rewards/margins": 0.0838027149438858, "rewards/margins_max": 0.11774241924285889, "rewards/margins_min": 0.04986302927136421, "rewards/margins_std": 0.04799797758460045, "rewards/rejected": -0.07234319299459457, "step": 1890 }, { "epoch": 0.72, "grad_norm": 0.34765625, "learning_rate": 1.0890910761431491e-07, "logits/chosen": -1.3522553443908691, "logits/rejected": -1.0552892684936523, "logps/chosen": -207.72488403320312, "logps/rejected": -233.9858856201172, "loss": 0.65, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020823100581765175, "rewards/margins": 0.08793775737285614, "rewards/margins_max": 0.12979640066623688, "rewards/margins_min": 0.04607909545302391, "rewards/margins_std": 0.059197068214416504, "rewards/rejected": -0.06711465120315552, "step": 1900 }, { "epoch": 0.73, "grad_norm": 0.2431640625, "learning_rate": 1.0618356860632208e-07, "logits/chosen": -1.486316442489624, "logits/rejected": -1.1503719091415405, "logps/chosen": -191.83619689941406, "logps/rejected": -218.7319793701172, "loss": 0.6508, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.029194846749305725, "rewards/margins": 0.0852990448474884, "rewards/margins_max": 0.11882505565881729, "rewards/margins_min": 0.05177304893732071, "rewards/margins_std": 0.047412920743227005, "rewards/rejected": -0.05610420182347298, "step": 1910 }, { "epoch": 0.73, "grad_norm": 0.3046875, "learning_rate": 1.0348334229922676e-07, "logits/chosen": -1.3859325647354126, "logits/rejected": -1.1395026445388794, "logps/chosen": -197.09481811523438, "logps/rejected": -218.48251342773438, "loss": 0.6476, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0274784155189991, "rewards/margins": 0.1017388105392456, "rewards/margins_max": 0.14634063839912415, "rewards/margins_min": 0.05713699012994766, "rewards/margins_std": 0.06307649612426758, "rewards/rejected": -0.07426039129495621, "step": 1920 }, { "epoch": 0.73, "grad_norm": 0.265625, "learning_rate": 1.0080890395180328e-07, "logits/chosen": -1.4809856414794922, "logits/rejected": -1.1530823707580566, "logps/chosen": -207.95291137695312, "logps/rejected": -221.2301788330078, "loss": 0.6498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022588089108467102, "rewards/margins": 0.09043975919485092, "rewards/margins_max": 0.1265183687210083, "rewards/margins_min": 0.05436115339398384, "rewards/margins_std": 0.05102284997701645, "rewards/rejected": -0.06785167008638382, "step": 1930 }, { "epoch": 0.74, "grad_norm": 0.294921875, "learning_rate": 9.816072428396374e-08, "logits/chosen": -1.5341602563858032, "logits/rejected": -1.2367621660232544, "logps/chosen": -221.6543426513672, "logps/rejected": -219.3003387451172, "loss": 0.6499, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017101837322115898, "rewards/margins": 0.0833502784371376, "rewards/margins_max": 0.11911274492740631, "rewards/margins_min": 0.0475878044962883, "rewards/margins_std": 0.050575774163007736, "rewards/rejected": -0.06624843925237656, "step": 1940 }, { "epoch": 0.74, "grad_norm": 0.30078125, "learning_rate": 9.553926939390847e-08, "logits/chosen": -1.348481297492981, "logits/rejected": -1.092132329940796, "logps/chosen": -180.2677001953125, "logps/rejected": -206.4167938232422, "loss": 0.6517, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020975422114133835, "rewards/margins": 0.08831629902124405, "rewards/margins_max": 0.12501828372478485, "rewards/margins_min": 0.05161432549357414, "rewards/margins_std": 0.051904432475566864, "rewards/rejected": -0.06734088063240051, "step": 1950 }, { "epoch": 0.74, "grad_norm": 0.3046875, "learning_rate": 9.29450006760894e-08, "logits/chosen": -1.4164844751358032, "logits/rejected": -1.0626654624938965, "logps/chosen": -217.0990753173828, "logps/rejected": -231.00747680664062, "loss": 0.6462, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.025280630216002464, "rewards/margins": 0.08944814652204514, "rewards/margins_max": 0.12857092916965485, "rewards/margins_min": 0.05032537132501602, "rewards/margins_std": 0.05532795935869217, "rewards/rejected": -0.06416751444339752, "step": 1960 }, { "epoch": 0.75, "grad_norm": 0.31640625, "learning_rate": 9.03783747400017e-08, "logits/chosen": -1.5357427597045898, "logits/rejected": -1.2725077867507935, "logps/chosen": -204.5410919189453, "logps/rejected": -243.62899780273438, "loss": 0.6459, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02568015828728676, "rewards/margins": 0.10661699622869492, "rewards/margins_max": 0.14371469616889954, "rewards/margins_min": 0.0695192739367485, "rewards/margins_std": 0.05246409773826599, "rewards/rejected": -0.08093683421611786, "step": 1970 }, { "epoch": 0.75, "grad_norm": 0.26171875, "learning_rate": 8.783984332981648e-08, "logits/chosen": -1.5041474103927612, "logits/rejected": -1.130979299545288, "logps/chosen": -194.86923217773438, "logps/rejected": -220.365478515625, "loss": 0.6489, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024836743250489235, "rewards/margins": 0.09214451909065247, "rewards/margins_max": 0.12188988924026489, "rewards/margins_min": 0.062399156391620636, "rewards/margins_std": 0.0420663021504879, "rewards/rejected": -0.06730777770280838, "step": 1980 }, { "epoch": 0.76, "grad_norm": 0.3046875, "learning_rate": 8.532985324487171e-08, "logits/chosen": -1.4796479940414429, "logits/rejected": -1.1508190631866455, "logps/chosen": -184.7924346923828, "logps/rejected": -212.42373657226562, "loss": 0.6484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01795900985598564, "rewards/margins": 0.09584374725818634, "rewards/margins_max": 0.13568690419197083, "rewards/margins_min": 0.05600060150027275, "rewards/margins_std": 0.05634673312306404, "rewards/rejected": -0.0778847485780716, "step": 1990 }, { "epoch": 0.76, "grad_norm": 0.29296875, "learning_rate": 8.284884626103164e-08, "logits/chosen": -1.4941637516021729, "logits/rejected": -1.2435134649276733, "logps/chosen": -188.8251190185547, "logps/rejected": -210.8880157470703, "loss": 0.6503, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02011449821293354, "rewards/margins": 0.08227307349443436, "rewards/margins_max": 0.11268408596515656, "rewards/margins_min": 0.05186206102371216, "rewards/margins_std": 0.04300766438245773, "rewards/rejected": -0.06215857341885567, "step": 2000 }, { "epoch": 0.76, "grad_norm": 0.2890625, "learning_rate": 8.039725905293138e-08, "logits/chosen": -1.3546192646026611, "logits/rejected": -1.0556727647781372, "logps/chosen": -189.37301635742188, "logps/rejected": -232.6674041748047, "loss": 0.6513, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02390899695456028, "rewards/margins": 0.0961272269487381, "rewards/margins_max": 0.14148414134979248, "rewards/margins_min": 0.05077032372355461, "rewards/margins_std": 0.06414436548948288, "rewards/rejected": -0.07221823185682297, "step": 2010 }, { "epoch": 0.77, "grad_norm": 0.30078125, "learning_rate": 7.797552311711905e-08, "logits/chosen": -1.5595532655715942, "logits/rejected": -1.205398678779602, "logps/chosen": -210.8866729736328, "logps/rejected": -214.4613800048828, "loss": 0.652, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01619151420891285, "rewards/margins": 0.07569371163845062, "rewards/margins_max": 0.11168579757213593, "rewards/margins_min": 0.039701610803604126, "rewards/margins_std": 0.05090050771832466, "rewards/rejected": -0.059502195566892624, "step": 2020 }, { "epoch": 0.77, "grad_norm": 0.359375, "learning_rate": 7.558406469610981e-08, "logits/chosen": -1.479150414466858, "logits/rejected": -1.1994903087615967, "logps/chosen": -203.69873046875, "logps/rejected": -213.0478973388672, "loss": 0.6519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01320022065192461, "rewards/margins": 0.08063283562660217, "rewards/margins_max": 0.12000874429941177, "rewards/margins_min": 0.04125692695379257, "rewards/margins_std": 0.055685948580503464, "rewards/rejected": -0.06743261963129044, "step": 2030 }, { "epoch": 0.78, "grad_norm": 0.328125, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.5190201997756958, "logits/rejected": -1.1927928924560547, "logps/chosen": -202.38818359375, "logps/rejected": -222.53903198242188, "loss": 0.6523, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.016741709783673286, "rewards/margins": 0.08067712187767029, "rewards/margins_max": 0.11362887918949127, "rewards/margins_min": 0.047725364565849304, "rewards/margins_std": 0.0466008223593235, "rewards/rejected": -0.06393541395664215, "step": 2040 }, { "epoch": 0.78, "grad_norm": 0.32421875, "learning_rate": 7.08936586492003e-08, "logits/chosen": -1.3734339475631714, "logits/rejected": -1.0133839845657349, "logps/chosen": -206.0966339111328, "logps/rejected": -218.1828155517578, "loss": 0.6502, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014113351702690125, "rewards/margins": 0.08260734379291534, "rewards/margins_max": 0.12146137654781342, "rewards/margins_min": 0.043753307312726974, "rewards/margins_std": 0.054947901517152786, "rewards/rejected": -0.06849398463964462, "step": 2050 }, { "epoch": 0.78, "grad_norm": 0.345703125, "learning_rate": 6.859553656767112e-08, "logits/chosen": -1.3706634044647217, "logits/rejected": -1.0720088481903076, "logps/chosen": -220.724853515625, "logps/rejected": -234.53756713867188, "loss": 0.6471, "rewards/accuracies": 0.9375, "rewards/chosen": 0.017995189875364304, "rewards/margins": 0.0858609676361084, "rewards/margins_max": 0.13481369614601135, "rewards/margins_min": 0.03690824285149574, "rewards/margins_std": 0.0692296102643013, "rewards/rejected": -0.0678657740354538, "step": 2060 }, { "epoch": 0.79, "grad_norm": 0.3125, "learning_rate": 6.63293429443845e-08, "logits/chosen": -1.484061360359192, "logits/rejected": -1.1830289363861084, "logps/chosen": -224.13644409179688, "logps/rejected": -251.21792602539062, "loss": 0.6527, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.020492028445005417, "rewards/margins": 0.0933627113699913, "rewards/margins_max": 0.1307709515094757, "rewards/margins_min": 0.055954478681087494, "rewards/margins_std": 0.052903227508068085, "rewards/rejected": -0.07287068665027618, "step": 2070 }, { "epoch": 0.79, "grad_norm": 0.375, "learning_rate": 6.409547664531733e-08, "logits/chosen": -1.392897367477417, "logits/rejected": -1.0433756113052368, "logps/chosen": -233.62039184570312, "logps/rejected": -221.968994140625, "loss": 0.6485, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.017711572349071503, "rewards/margins": 0.09063192456960678, "rewards/margins_max": 0.1303481161594391, "rewards/margins_min": 0.05091572925448418, "rewards/margins_std": 0.05616719275712967, "rewards/rejected": -0.07292035222053528, "step": 2080 }, { "epoch": 0.79, "grad_norm": 0.287109375, "learning_rate": 6.189433084661031e-08, "logits/chosen": -1.441162347793579, "logits/rejected": -1.0751087665557861, "logps/chosen": -211.349365234375, "logps/rejected": -226.0006103515625, "loss": 0.6491, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.017282210290431976, "rewards/margins": 0.0929594486951828, "rewards/margins_max": 0.13247843086719513, "rewards/margins_min": 0.05344045162200928, "rewards/margins_std": 0.05588828772306442, "rewards/rejected": -0.07567723095417023, "step": 2090 }, { "epoch": 0.8, "grad_norm": 0.302734375, "learning_rate": 5.972629296536655e-08, "logits/chosen": -1.398723840713501, "logits/rejected": -1.0770161151885986, "logps/chosen": -208.3170928955078, "logps/rejected": -220.9575653076172, "loss": 0.6522, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021888596937060356, "rewards/margins": 0.08464206010103226, "rewards/margins_max": 0.12029530853033066, "rewards/margins_min": 0.04898880049586296, "rewards/margins_std": 0.05042130872607231, "rewards/rejected": -0.06275346130132675, "step": 2100 }, { "epoch": 0.8, "grad_norm": 0.314453125, "learning_rate": 5.7591744591463375e-08, "logits/chosen": -1.4472309350967407, "logits/rejected": -1.021103858947754, "logps/chosen": -226.20626831054688, "logps/rejected": -214.0725860595703, "loss": 0.6477, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017262738198041916, "rewards/margins": 0.09329278767108917, "rewards/margins_max": 0.13951919972896576, "rewards/margins_min": 0.04706636443734169, "rewards/margins_std": 0.06537402421236038, "rewards/rejected": -0.07603004574775696, "step": 2110 }, { "epoch": 0.81, "grad_norm": 0.314453125, "learning_rate": 5.5491061420390174e-08, "logits/chosen": -1.461857557296753, "logits/rejected": -1.0689175128936768, "logps/chosen": -231.24649047851562, "logps/rejected": -246.7604217529297, "loss": 0.6547, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.019667720422148705, "rewards/margins": 0.08514972031116486, "rewards/margins_max": 0.12298393249511719, "rewards/margins_min": 0.04731552302837372, "rewards/margins_std": 0.05350564047694206, "rewards/rejected": -0.0654820054769516, "step": 2120 }, { "epoch": 0.81, "grad_norm": 0.271484375, "learning_rate": 5.342461318712252e-08, "logits/chosen": -1.4466235637664795, "logits/rejected": -1.14009690284729, "logps/chosen": -176.4794158935547, "logps/rejected": -235.32144165039062, "loss": 0.6488, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.018947120755910873, "rewards/margins": 0.09472303092479706, "rewards/margins_max": 0.13757546246051788, "rewards/margins_min": 0.05187060683965683, "rewards/margins_std": 0.060602474957704544, "rewards/rejected": -0.07577590644359589, "step": 2130 }, { "epoch": 0.81, "grad_norm": 0.365234375, "learning_rate": 5.1392763601047244e-08, "logits/chosen": -1.4163846969604492, "logits/rejected": -1.0834262371063232, "logps/chosen": -187.30274963378906, "logps/rejected": -220.01986694335938, "loss": 0.645, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01945066824555397, "rewards/margins": 0.09664531797170639, "rewards/margins_max": 0.13591603934764862, "rewards/margins_min": 0.057374607771635056, "rewards/margins_std": 0.05553717538714409, "rewards/rejected": -0.07719465345144272, "step": 2140 }, { "epoch": 0.82, "grad_norm": 0.326171875, "learning_rate": 4.939587028194625e-08, "logits/chosen": -1.4588258266448975, "logits/rejected": -1.075309157371521, "logps/chosen": -231.6609344482422, "logps/rejected": -194.46878051757812, "loss": 0.6523, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013223061338067055, "rewards/margins": 0.08043862879276276, "rewards/margins_max": 0.11344832181930542, "rewards/margins_min": 0.0474289245903492, "rewards/margins_std": 0.04668276757001877, "rewards/rejected": -0.06721556931734085, "step": 2150 }, { "epoch": 0.82, "grad_norm": 0.345703125, "learning_rate": 4.743428469705335e-08, "logits/chosen": -1.4122099876403809, "logits/rejected": -1.1665161848068237, "logps/chosen": -210.12646484375, "logps/rejected": -255.7862091064453, "loss": 0.6507, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015928715467453003, "rewards/margins": 0.08897783607244492, "rewards/margins_max": 0.12421097606420517, "rewards/margins_min": 0.053744666278362274, "rewards/margins_std": 0.0498272180557251, "rewards/rejected": -0.07304911315441132, "step": 2160 }, { "epoch": 0.82, "grad_norm": 0.322265625, "learning_rate": 4.550835209919326e-08, "logits/chosen": -1.3729497194290161, "logits/rejected": -1.0899299383163452, "logps/chosen": -202.81869506835938, "logps/rejected": -222.53366088867188, "loss": 0.6513, "rewards/accuracies": 1.0, "rewards/chosen": 0.021017426624894142, "rewards/margins": 0.0997917577624321, "rewards/margins_max": 0.1454765796661377, "rewards/margins_min": 0.0541069433093071, "rewards/margins_std": 0.06460809707641602, "rewards/rejected": -0.0787743479013443, "step": 2170 }, { "epoch": 0.83, "grad_norm": 0.33984375, "learning_rate": 4.361841146601516e-08, "logits/chosen": -1.3598549365997314, "logits/rejected": -1.1428276300430298, "logps/chosen": -225.26675415039062, "logps/rejected": -262.83294677734375, "loss": 0.647, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02677268348634243, "rewards/margins": 0.10582437366247177, "rewards/margins_max": 0.157110795378685, "rewards/margins_min": 0.05453797057271004, "rewards/margins_std": 0.07252994179725647, "rewards/rejected": -0.07905169576406479, "step": 2180 }, { "epoch": 0.83, "grad_norm": 0.30078125, "learning_rate": 4.1764795440329516e-08, "logits/chosen": -1.4309017658233643, "logits/rejected": -1.1507847309112549, "logps/chosen": -207.42568969726562, "logps/rejected": -230.8451385498047, "loss": 0.6523, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01876028999686241, "rewards/margins": 0.08968784660100937, "rewards/margins_max": 0.12752839922904968, "rewards/margins_min": 0.05184728652238846, "rewards/margins_std": 0.05351463705301285, "rewards/rejected": -0.07092756032943726, "step": 2190 }, { "epoch": 0.84, "grad_norm": 0.376953125, "learning_rate": 3.994783027156143e-08, "logits/chosen": -1.5498156547546387, "logits/rejected": -1.1738865375518799, "logps/chosen": -235.3194122314453, "logps/rejected": -264.9289855957031, "loss": 0.652, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020533021539449692, "rewards/margins": 0.09463083744049072, "rewards/margins_max": 0.1284172236919403, "rewards/margins_min": 0.06084446236491203, "rewards/margins_std": 0.04778115078806877, "rewards/rejected": -0.07409781217575073, "step": 2200 }, { "epoch": 0.84, "grad_norm": 0.3203125, "learning_rate": 3.81678357583278e-08, "logits/chosen": -1.387663722038269, "logits/rejected": -1.0362415313720703, "logps/chosen": -201.65501403808594, "logps/rejected": -223.9352569580078, "loss": 0.6507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.019265640527009964, "rewards/margins": 0.08608859777450562, "rewards/margins_max": 0.11621971428394318, "rewards/margins_min": 0.055957477539777756, "rewards/margins_std": 0.04261183738708496, "rewards/rejected": -0.06682296097278595, "step": 2210 }, { "epoch": 0.84, "grad_norm": 0.30859375, "learning_rate": 3.6425125192150854e-08, "logits/chosen": -1.3626813888549805, "logits/rejected": -1.0685607194900513, "logps/chosen": -202.44815063476562, "logps/rejected": -236.275146484375, "loss": 0.6504, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022014940157532692, "rewards/margins": 0.10008475929498672, "rewards/margins_max": 0.1432458460330963, "rewards/margins_min": 0.05692365765571594, "rewards/margins_std": 0.061039019376039505, "rewards/rejected": -0.07806982100009918, "step": 2220 }, { "epoch": 0.85, "grad_norm": 0.26953125, "learning_rate": 3.4720005302316555e-08, "logits/chosen": -1.4205926656723022, "logits/rejected": -1.059009313583374, "logps/chosen": -204.2139434814453, "logps/rejected": -223.8666534423828, "loss": 0.6446, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.025852352380752563, "rewards/margins": 0.10249295085668564, "rewards/margins_max": 0.1471790224313736, "rewards/margins_min": 0.057806871831417084, "rewards/margins_std": 0.06319564580917358, "rewards/rejected": -0.07664059847593307, "step": 2230 }, { "epoch": 0.85, "grad_norm": 0.294921875, "learning_rate": 3.305277620188826e-08, "logits/chosen": -1.4504072666168213, "logits/rejected": -1.0379372835159302, "logps/chosen": -191.2012939453125, "logps/rejected": -193.94325256347656, "loss": 0.6515, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.022001946344971657, "rewards/margins": 0.08406315743923187, "rewards/margins_max": 0.12489868700504303, "rewards/margins_min": 0.04322761669754982, "rewards/margins_std": 0.057750165462493896, "rewards/rejected": -0.06206120178103447, "step": 2240 }, { "epoch": 0.85, "grad_norm": 0.330078125, "learning_rate": 3.142373133488416e-08, "logits/chosen": -1.4936898946762085, "logits/rejected": -1.0924030542373657, "logps/chosen": -209.015380859375, "logps/rejected": -224.1560821533203, "loss": 0.6471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02342359907925129, "rewards/margins": 0.0905032753944397, "rewards/margins_max": 0.12563428282737732, "rewards/margins_min": 0.05537227541208267, "rewards/margins_std": 0.04968274012207985, "rewards/rejected": -0.06707967817783356, "step": 2250 }, { "epoch": 0.86, "grad_norm": 0.302734375, "learning_rate": 2.9833157424629965e-08, "logits/chosen": -1.486553430557251, "logits/rejected": -1.2344576120376587, "logps/chosen": -190.970703125, "logps/rejected": -235.8113555908203, "loss": 0.6472, "rewards/accuracies": 0.9375, "rewards/chosen": 0.025419479236006737, "rewards/margins": 0.10403518378734589, "rewards/margins_max": 0.1522940993309021, "rewards/margins_min": 0.05577626824378967, "rewards/margins_std": 0.06824841350317001, "rewards/rejected": -0.078615702688694, "step": 2260 }, { "epoch": 0.86, "grad_norm": 0.27734375, "learning_rate": 2.8281334423292752e-08, "logits/chosen": -1.405575156211853, "logits/rejected": -1.1565072536468506, "logps/chosen": -184.67291259765625, "logps/rejected": -215.6078338623047, "loss": 0.6489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.022038763388991356, "rewards/margins": 0.08945713192224503, "rewards/margins_max": 0.12467725574970245, "rewards/margins_min": 0.05423703044652939, "rewards/margins_std": 0.049808748066425323, "rewards/rejected": -0.06741837412118912, "step": 2270 }, { "epoch": 0.87, "grad_norm": 0.240234375, "learning_rate": 2.6768535462607905e-08, "logits/chosen": -1.405057668685913, "logits/rejected": -1.0886411666870117, "logps/chosen": -207.4287109375, "logps/rejected": -224.9873809814453, "loss": 0.6502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021886788308620453, "rewards/margins": 0.09229324758052826, "rewards/margins_max": 0.13078710436820984, "rewards/margins_min": 0.053799428045749664, "rewards/margins_std": 0.054438501596450806, "rewards/rejected": -0.0704064816236496, "step": 2280 }, { "epoch": 0.87, "grad_norm": 0.3046875, "learning_rate": 2.529502680580578e-08, "logits/chosen": -1.4395860433578491, "logits/rejected": -1.1944835186004639, "logps/chosen": -197.48228454589844, "logps/rejected": -224.9473419189453, "loss": 0.6498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017150208353996277, "rewards/margins": 0.0919913500547409, "rewards/margins_max": 0.13415281474590302, "rewards/margins_min": 0.049829889088869095, "rewards/margins_std": 0.05962531641125679, "rewards/rejected": -0.07484114170074463, "step": 2290 }, { "epoch": 0.87, "grad_norm": 0.296875, "learning_rate": 2.386106780074784e-08, "logits/chosen": -1.432604432106018, "logits/rejected": -1.134526014328003, "logps/chosen": -207.81283569335938, "logps/rejected": -234.94784545898438, "loss": 0.6503, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.022798217833042145, "rewards/margins": 0.09147022664546967, "rewards/margins_max": 0.13085611164569855, "rewards/margins_min": 0.05208434537053108, "rewards/margins_std": 0.055700045078992844, "rewards/rejected": -0.06867200881242752, "step": 2300 }, { "epoch": 0.88, "grad_norm": 0.318359375, "learning_rate": 2.2466910834278957e-08, "logits/chosen": -1.4143720865249634, "logits/rejected": -1.069481611251831, "logps/chosen": -207.4478759765625, "logps/rejected": -218.31716918945312, "loss": 0.6489, "rewards/accuracies": 1.0, "rewards/chosen": 0.021387558430433273, "rewards/margins": 0.09235554933547974, "rewards/margins_max": 0.13592186570167542, "rewards/margins_min": 0.048789240419864655, "rewards/margins_std": 0.0616120770573616, "rewards/rejected": -0.07096799463033676, "step": 2310 }, { "epoch": 0.88, "grad_norm": 0.337890625, "learning_rate": 2.1112801287806375e-08, "logits/chosen": -1.3332234621047974, "logits/rejected": -1.115999698638916, "logps/chosen": -195.05917358398438, "logps/rejected": -202.2626495361328, "loss": 0.6477, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01643414981663227, "rewards/margins": 0.0895700454711914, "rewards/margins_max": 0.12620273232460022, "rewards/margins_min": 0.0529373399913311, "rewards/margins_std": 0.051806457340717316, "rewards/rejected": -0.07313589006662369, "step": 2320 }, { "epoch": 0.89, "grad_norm": 0.302734375, "learning_rate": 1.9798977494110274e-08, "logits/chosen": -1.473491907119751, "logits/rejected": -1.1137970685958862, "logps/chosen": -213.4473114013672, "logps/rejected": -226.2127685546875, "loss": 0.65, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.024023082107305527, "rewards/margins": 0.09007260203361511, "rewards/margins_max": 0.12320313602685928, "rewards/margins_min": 0.056942082941532135, "rewards/margins_std": 0.04685363173484802, "rewards/rejected": -0.06604952365159988, "step": 2330 }, { "epoch": 0.89, "grad_norm": 0.33203125, "learning_rate": 1.852567069539568e-08, "logits/chosen": -1.4035086631774902, "logits/rejected": -1.0367395877838135, "logps/chosen": -222.76388549804688, "logps/rejected": -218.81900024414062, "loss": 0.6472, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01737220212817192, "rewards/margins": 0.09154538065195084, "rewards/margins_max": 0.12961694598197937, "rewards/margins_min": 0.053473830223083496, "rewards/margins_std": 0.05384131520986557, "rewards/rejected": -0.07417318224906921, "step": 2340 }, { "epoch": 0.89, "grad_norm": 0.345703125, "learning_rate": 1.729310500259229e-08, "logits/chosen": -1.438696265220642, "logits/rejected": -1.103390097618103, "logps/chosen": -204.58889770507812, "logps/rejected": -197.52224731445312, "loss": 0.6518, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.016056453809142113, "rewards/margins": 0.08784165978431702, "rewards/margins_max": 0.13728336989879608, "rewards/margins_min": 0.03839995712041855, "rewards/margins_std": 0.06992112845182419, "rewards/rejected": -0.07178520411252975, "step": 2350 }, { "epoch": 0.9, "grad_norm": 0.298828125, "learning_rate": 1.610149735590949e-08, "logits/chosen": -1.49490225315094, "logits/rejected": -1.0492806434631348, "logps/chosen": -233.8212432861328, "logps/rejected": -238.4543914794922, "loss": 0.6482, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01798924431204796, "rewards/margins": 0.08889796584844589, "rewards/margins_max": 0.12274952232837677, "rewards/margins_min": 0.05504639074206352, "rewards/margins_std": 0.04787334054708481, "rewards/rejected": -0.07090871036052704, "step": 2360 }, { "epoch": 0.9, "grad_norm": 0.376953125, "learning_rate": 1.4951057486652845e-08, "logits/chosen": -1.4752815961837769, "logits/rejected": -1.0967390537261963, "logps/chosen": -215.08462524414062, "logps/rejected": -239.4650115966797, "loss": 0.6474, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.016311800107359886, "rewards/margins": 0.09782110154628754, "rewards/margins_max": 0.14107748866081238, "rewards/margins_min": 0.0545647032558918, "rewards/margins_std": 0.06117378547787666, "rewards/rejected": -0.0815092995762825, "step": 2370 }, { "epoch": 0.9, "grad_norm": 0.27734375, "learning_rate": 1.384198788031063e-08, "logits/chosen": -1.40728759765625, "logits/rejected": -1.0882267951965332, "logps/chosen": -215.80819702148438, "logps/rejected": -219.1876678466797, "loss": 0.655, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.013580176047980785, "rewards/margins": 0.08551829308271408, "rewards/margins_max": 0.13581404089927673, "rewards/margins_min": 0.035222552716732025, "rewards/margins_std": 0.07112891972064972, "rewards/rejected": -0.07193811982870102, "step": 2380 }, { "epoch": 0.91, "grad_norm": 0.318359375, "learning_rate": 1.2774483740914416e-08, "logits/chosen": -1.4530203342437744, "logits/rejected": -1.204655408859253, "logps/chosen": -201.9489288330078, "logps/rejected": -252.07650756835938, "loss": 0.6512, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02443147823214531, "rewards/margins": 0.08860547840595245, "rewards/margins_max": 0.12207148969173431, "rewards/margins_min": 0.05513947084546089, "rewards/margins_std": 0.04732808098196983, "rewards/rejected": -0.06417400389909744, "step": 2390 }, { "epoch": 0.91, "grad_norm": 0.271484375, "learning_rate": 1.1748732956682023e-08, "logits/chosen": -1.3249019384384155, "logits/rejected": -1.0664135217666626, "logps/chosen": -185.0057373046875, "logps/rejected": -204.3482208251953, "loss": 0.6547, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.018103353679180145, "rewards/margins": 0.08620314300060272, "rewards/margins_max": 0.1258658468723297, "rewards/margins_min": 0.04654044285416603, "rewards/margins_std": 0.056091535836458206, "rewards/rejected": -0.06809979677200317, "step": 2400 }, { "epoch": 0.92, "grad_norm": 0.33203125, "learning_rate": 1.0764916066947794e-08, "logits/chosen": -1.5028488636016846, "logits/rejected": -1.139801025390625, "logps/chosen": -216.3588104248047, "logps/rejected": -209.8426055908203, "loss": 0.6507, "rewards/accuracies": 1.0, "rewards/chosen": 0.012566794641315937, "rewards/margins": 0.08007965236902237, "rewards/margins_max": 0.11527623981237411, "rewards/margins_min": 0.04488305002450943, "rewards/margins_std": 0.04977550357580185, "rewards/rejected": -0.06751285493373871, "step": 2410 }, { "epoch": 0.92, "grad_norm": 0.388671875, "learning_rate": 9.823206230386515e-09, "logits/chosen": -1.4211446046829224, "logits/rejected": -1.1179053783416748, "logps/chosen": -196.47525024414062, "logps/rejected": -217.45126342773438, "loss": 0.6468, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.019422296434640884, "rewards/margins": 0.10285399109125137, "rewards/margins_max": 0.14698545634746552, "rewards/margins_min": 0.058722518384456635, "rewards/margins_std": 0.062411319464445114, "rewards/rejected": -0.0834316834807396, "step": 2420 }, { "epoch": 0.92, "grad_norm": 0.412109375, "learning_rate": 8.923769194536218e-09, "logits/chosen": -1.3940680027008057, "logits/rejected": -1.283018708229065, "logps/chosen": -179.62240600585938, "logps/rejected": -204.8430938720703, "loss": 0.652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.018096450716257095, "rewards/margins": 0.07983745634555817, "rewards/margins_max": 0.11743442714214325, "rewards/margins_min": 0.04224049299955368, "rewards/margins_std": 0.053170137107372284, "rewards/rejected": -0.06174100562930107, "step": 2430 }, { "epoch": 0.93, "grad_norm": 0.3203125, "learning_rate": 8.066763266625282e-09, "logits/chosen": -1.453731894493103, "logits/rejected": -1.2243844270706177, "logps/chosen": -181.93214416503906, "logps/rejected": -204.25790405273438, "loss": 0.6461, "rewards/accuracies": 1.0, "rewards/chosen": 0.019669560715556145, "rewards/margins": 0.09502781927585602, "rewards/margins_max": 0.13680796325206757, "rewards/margins_min": 0.05324765294790268, "rewards/margins_std": 0.059086065739393234, "rewards/rejected": -0.07535825669765472, "step": 2440 }, { "epoch": 0.93, "grad_norm": 0.330078125, "learning_rate": 7.252339285709619e-09, "logits/chosen": -1.5105646848678589, "logits/rejected": -1.1885838508605957, "logps/chosen": -191.6083984375, "logps/rejected": -217.36001586914062, "loss": 0.6523, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.025863632559776306, "rewards/margins": 0.08741247653961182, "rewards/margins_max": 0.12167079746723175, "rewards/margins_min": 0.05315415933728218, "rewards/margins_std": 0.048448581248521805, "rewards/rejected": -0.06154884770512581, "step": 2450 }, { "epoch": 0.93, "grad_norm": 0.296875, "learning_rate": 6.480640596123549e-09, "logits/chosen": -1.3854317665100098, "logits/rejected": -1.246504306793213, "logps/chosen": -206.4389190673828, "logps/rejected": -241.0319061279297, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 0.02236495353281498, "rewards/margins": 0.09376935660839081, "rewards/margins_max": 0.1314789205789566, "rewards/margins_min": 0.056059788912534714, "rewards/margins_std": 0.053329385817050934, "rewards/rejected": -0.07140441238880157, "step": 2460 }, { "epoch": 0.94, "grad_norm": 0.3046875, "learning_rate": 5.751803022250479e-09, "logits/chosen": -1.5128895044326782, "logits/rejected": -1.1328929662704468, "logps/chosen": -252.16366577148438, "logps/rejected": -239.92733764648438, "loss": 0.6499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.014960886910557747, "rewards/margins": 0.07866047322750092, "rewards/margins_max": 0.11670245975255966, "rewards/margins_min": 0.04061848670244217, "rewards/margins_std": 0.053799498826265335, "rewards/rejected": -0.06369959563016891, "step": 2470 }, { "epoch": 0.94, "grad_norm": 0.3125, "learning_rate": 5.065954844616721e-09, "logits/chosen": -1.308065414428711, "logits/rejected": -1.095858097076416, "logps/chosen": -185.49024963378906, "logps/rejected": -205.8270263671875, "loss": 0.6528, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02121925540268421, "rewards/margins": 0.09081675857305527, "rewards/margins_max": 0.12591932713985443, "rewards/margins_min": 0.0557141974568367, "rewards/margins_std": 0.04964253306388855, "rewards/rejected": -0.0695975124835968, "step": 2480 }, { "epoch": 0.95, "grad_norm": 0.34375, "learning_rate": 4.4232167773132215e-09, "logits/chosen": -1.3658572435379028, "logits/rejected": -1.1204966306686401, "logps/chosen": -187.7608642578125, "logps/rejected": -222.4073486328125, "loss": 0.6518, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.014262977056205273, "rewards/margins": 0.08244883269071579, "rewards/margins_max": 0.11362477391958237, "rewards/margins_min": 0.051272887736558914, "rewards/margins_std": 0.04408944398164749, "rewards/rejected": -0.06818585842847824, "step": 2490 }, { "epoch": 0.95, "grad_norm": 0.3125, "learning_rate": 3.823701946749053e-09, "logits/chosen": -1.4690792560577393, "logits/rejected": -1.1181033849716187, "logps/chosen": -185.13882446289062, "logps/rejected": -213.7282257080078, "loss": 0.649, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02593539096415043, "rewards/margins": 0.08635548502206802, "rewards/margins_max": 0.12079595029354095, "rewards/margins_min": 0.0519150085747242, "rewards/margins_std": 0.04870619252324104, "rewards/rejected": -0.06042008846998215, "step": 2500 }, { "epoch": 0.95, "grad_norm": 0.318359375, "learning_rate": 3.267515871740484e-09, "logits/chosen": -1.426261067390442, "logits/rejected": -1.221677541732788, "logps/chosen": -185.28851318359375, "logps/rejected": -215.24221801757812, "loss": 0.6442, "rewards/accuracies": 1.0, "rewards/chosen": 0.018279213458299637, "rewards/margins": 0.10030057281255722, "rewards/margins_max": 0.13936129212379456, "rewards/margins_min": 0.06123984977602959, "rewards/margins_std": 0.055240195244550705, "rewards/rejected": -0.08202135562896729, "step": 2510 }, { "epoch": 0.96, "grad_norm": 0.30078125, "learning_rate": 2.754756444938666e-09, "logits/chosen": -1.5783092975616455, "logits/rejected": -1.1907222270965576, "logps/chosen": -210.3064727783203, "logps/rejected": -241.8881378173828, "loss": 0.6526, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.016610082238912582, "rewards/margins": 0.08524598181247711, "rewards/margins_max": 0.12586773931980133, "rewards/margins_min": 0.04462422430515289, "rewards/margins_std": 0.05744783952832222, "rewards/rejected": -0.06863589584827423, "step": 2520 }, { "epoch": 0.96, "grad_norm": 0.28515625, "learning_rate": 2.285513915600168e-09, "logits/chosen": -1.398267388343811, "logits/rejected": -1.142956018447876, "logps/chosen": -180.8777618408203, "logps/rejected": -212.1314239501953, "loss": 0.652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.019063105806708336, "rewards/margins": 0.08323358744382858, "rewards/margins_max": 0.11851847171783447, "rewards/margins_min": 0.04794871434569359, "rewards/margins_std": 0.0499003566801548, "rewards/rejected": -0.0641704872250557, "step": 2530 }, { "epoch": 0.97, "grad_norm": 0.322265625, "learning_rate": 1.859870873702124e-09, "logits/chosen": -1.4865785837173462, "logits/rejected": -1.0758774280548096, "logps/chosen": -229.53744506835938, "logps/rejected": -225.7979736328125, "loss": 0.6474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027804816141724586, "rewards/margins": 0.10115760564804077, "rewards/margins_max": 0.1517588049173355, "rewards/margins_min": 0.05055641010403633, "rewards/margins_std": 0.07156090438365936, "rewards/rejected": -0.07335279136896133, "step": 2540 }, { "epoch": 0.97, "grad_norm": 0.326171875, "learning_rate": 1.4779022354061698e-09, "logits/chosen": -1.4936548471450806, "logits/rejected": -1.1473426818847656, "logps/chosen": -198.7931671142578, "logps/rejected": -211.90811157226562, "loss": 0.6559, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.017037371173501015, "rewards/margins": 0.07735893130302429, "rewards/margins_max": 0.11480559408664703, "rewards/margins_min": 0.039912257343530655, "rewards/margins_std": 0.05295759439468384, "rewards/rejected": -0.06032155826687813, "step": 2550 }, { "epoch": 0.97, "grad_norm": 0.330078125, "learning_rate": 1.1396752298723499e-09, "logits/chosen": -1.4866702556610107, "logits/rejected": -1.0954639911651611, "logps/chosen": -204.0499725341797, "logps/rejected": -212.27188110351562, "loss": 0.6457, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.017862681299448013, "rewards/margins": 0.09754244983196259, "rewards/margins_max": 0.1323915421962738, "rewards/margins_min": 0.06269336491823196, "rewards/margins_std": 0.04928405210375786, "rewards/rejected": -0.07967977225780487, "step": 2560 }, { "epoch": 0.98, "grad_norm": 0.271484375, "learning_rate": 8.452493874266108e-10, "logits/chosen": -1.322525978088379, "logits/rejected": -1.0981214046478271, "logps/chosen": -210.36325073242188, "logps/rejected": -231.64987182617188, "loss": 0.6511, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.017954757437109947, "rewards/margins": 0.08609770238399506, "rewards/margins_max": 0.12132489681243896, "rewards/margins_min": 0.050870515406131744, "rewards/margins_std": 0.04981876164674759, "rewards/rejected": -0.06814294308423996, "step": 2570 }, { "epoch": 0.98, "grad_norm": 0.287109375, "learning_rate": 5.946765290827383e-10, "logits/chosen": -1.3004374504089355, "logits/rejected": -1.0841004848480225, "logps/chosen": -201.1930389404297, "logps/rejected": -225.4813690185547, "loss": 0.6504, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.020434178411960602, "rewards/margins": 0.08689124882221222, "rewards/margins_max": 0.1344389021396637, "rewards/margins_min": 0.039343591779470444, "rewards/margins_std": 0.06724254041910172, "rewards/rejected": -0.06645707041025162, "step": 2580 }, { "epoch": 0.98, "grad_norm": 0.3125, "learning_rate": 3.880007574218469e-10, "logits/chosen": -1.4542973041534424, "logits/rejected": -1.1403675079345703, "logps/chosen": -185.40380859375, "logps/rejected": -219.89501953125, "loss": 0.6496, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.028258394449949265, "rewards/margins": 0.09239298850297928, "rewards/margins_max": 0.1326712816953659, "rewards/margins_min": 0.05211470276117325, "rewards/margins_std": 0.05696210265159607, "rewards/rejected": -0.06413459777832031, "step": 2590 }, { "epoch": 0.99, "grad_norm": 0.34375, "learning_rate": 2.2525844882964606e-10, "logits/chosen": -1.496821403503418, "logits/rejected": -1.1975185871124268, "logps/chosen": -213.3788604736328, "logps/rejected": -249.6721649169922, "loss": 0.6505, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022496605291962624, "rewards/margins": 0.08701016753911972, "rewards/margins_max": 0.12534096837043762, "rewards/margins_min": 0.04867938160896301, "rewards/margins_std": 0.05420792102813721, "rewards/rejected": -0.06451357156038284, "step": 2600 }, { "epoch": 0.99, "grad_norm": 0.29296875, "learning_rate": 1.0647824709419939e-10, "logits/chosen": -1.4642287492752075, "logits/rejected": -1.195237398147583, "logps/chosen": -173.61244201660156, "logps/rejected": -194.322509765625, "loss": 0.649, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.026876743882894516, "rewards/margins": 0.10324034839868546, "rewards/margins_max": 0.14352624118328094, "rewards/margins_min": 0.06295443326234818, "rewards/margins_std": 0.05697287991642952, "rewards/rejected": -0.07636359333992004, "step": 2610 }, { "epoch": 1.0, "grad_norm": 0.34765625, "learning_rate": 3.168105836440227e-11, "logits/chosen": -1.430641770362854, "logits/rejected": -1.058257818222046, "logps/chosen": -219.7120819091797, "logps/rejected": -215.89663696289062, "loss": 0.6453, "rewards/accuracies": 1.0, "rewards/chosen": 0.01688862219452858, "rewards/margins": 0.095039002597332, "rewards/margins_max": 0.13273173570632935, "rewards/margins_min": 0.05734627693891525, "rewards/margins_std": 0.053305573761463165, "rewards/rejected": -0.07815037667751312, "step": 2620 }, { "epoch": 1.0, "grad_norm": 0.3203125, "learning_rate": 8.800474701475824e-13, "logits/chosen": -1.5000232458114624, "logits/rejected": -1.2522644996643066, "logps/chosen": -189.9681854248047, "logps/rejected": -215.1830291748047, "loss": 0.6522, "rewards/accuracies": 1.0, "rewards/chosen": 0.019782623276114464, "rewards/margins": 0.07797619700431824, "rewards/margins_max": 0.10702647268772125, "rewards/margins_min": 0.04892592877149582, "rewards/margins_std": 0.041083287447690964, "rewards/rejected": -0.058193571865558624, "step": 2630 }, { "epoch": 1.0, "eval_logits/chosen": -1.013566493988037, "eval_logits/rejected": -0.8866661190986633, "eval_logps/chosen": -326.8977355957031, "eval_logps/rejected": -315.0963439941406, "eval_loss": 0.6891594529151917, "eval_rewards/accuracies": 0.5820000171661377, "eval_rewards/chosen": 0.004229320678859949, "eval_rewards/margins": 0.008481495082378387, "eval_rewards/margins_max": 0.0640675351023674, "eval_rewards/margins_min": -0.04490014538168907, "eval_rewards/margins_std": 0.03645985573530197, "eval_rewards/rejected": -0.004252173937857151, "eval_runtime": 4330.3916, "eval_samples_per_second": 2.771, "eval_steps_per_second": 0.173, "step": 2632 }, { "epoch": 1.0, "step": 2632, "total_flos": 0.0, "train_loss": 0.658518510975374, "train_runtime": 42431.3226, "train_samples_per_second": 0.992, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 2632, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }