{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.8062015503875966, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 486.7500228881836, "epoch": 0.015503875968992248, "grad_norm": 0.26961565017700195, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": -0.0045, "reward": 0.0035714288242161274, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0357142873108387, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 466.6964416503906, "epoch": 0.031007751937984496, "grad_norm": 0.3279306888580322, "kl": 0.0, "learning_rate": 6.666666666666667e-07, "loss": 0.0129, "reward": 0.010714286705479026, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.10714286379516125, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 561.5714569091797, "epoch": 0.046511627906976744, "grad_norm": 0.14393426477909088, "kl": 0.0003495216369628906, "learning_rate": 1.0000000000000002e-06, "loss": -0.0023, "reward": 0.0017857144121080637, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.01785714365541935, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 548.8750228881836, "epoch": 0.06201550387596899, "grad_norm": 0.0009427572367712855, "kl": 0.00028252601623535156, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 497.7857360839844, "epoch": 0.07751937984496124, "grad_norm": 0.19416852295398712, "kl": 0.0003070831298828125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0211, "reward": 0.0035714288242161274, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0357142873108387, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 551.0893173217773, "epoch": 0.09302325581395349, "grad_norm": 0.2645115256309509, "kl": 0.00034809112548828125, "learning_rate": 2.0000000000000003e-06, "loss": 0.0314, "reward": 0.005357143469154835, "reward_std": 0.007576144300401211, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0535714328289032, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 525.3750152587891, "epoch": 0.10852713178294573, "grad_norm": 0.4168189764022827, "kl": 0.00038433074951171875, "learning_rate": 2.3333333333333336e-06, "loss": -0.0174, "reward": 0.012500000884756446, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.12500000558793545, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 449.35716247558594, "epoch": 0.12403100775193798, "grad_norm": 0.26261523365974426, "kl": 0.0006132125854492188, "learning_rate": 2.666666666666667e-06, "loss": -0.0101, "reward": 0.0035714288242161274, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0357142873108387, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 517.0178833007812, "epoch": 0.13953488372093023, "grad_norm": 0.38538143038749695, "kl": 0.001514434814453125, "learning_rate": 3e-06, "loss": 0.0096, "reward": 0.008928572293370962, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.0892857201397419, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 548.1964492797852, "epoch": 0.15503875968992248, "grad_norm": 0.2276693731546402, "kl": 0.0036163330078125, "learning_rate": 3.3333333333333333e-06, "loss": -0.006, "reward": 0.010714286705479026, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.10714286379516125, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 503.0714569091797, "epoch": 0.17054263565891473, "grad_norm": 0.45380452275276184, "kl": 0.0091552734375, "learning_rate": 3.6666666666666666e-06, "loss": 0.0442, "reward": 0.014285715529695153, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.14285715110599995, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 499.9285888671875, "epoch": 0.18604651162790697, "grad_norm": 0.4393024444580078, "kl": 0.0289306640625, "learning_rate": 4.000000000000001e-06, "loss": 0.0051, "reward": 0.028571431059390306, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.2857142947614193, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 444.10716247558594, "epoch": 0.20155038759689922, "grad_norm": 0.8636987209320068, "kl": 0.110595703125, "learning_rate": 4.333333333333334e-06, "loss": 0.0191, "reward": 0.026785716880112886, "reward_std": 0.03282995941117406, "rewards/code_reward": 0.0, "rewards/format_reward": 0.2678571604192257, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 417.57144927978516, "epoch": 0.21705426356589147, "grad_norm": 0.6144067645072937, "kl": 0.06768798828125, "learning_rate": 4.666666666666667e-06, "loss": 0.0193, "reward": 0.03928571753203869, "reward_std": 0.030304578132927418, "rewards/code_reward": 0.0, "rewards/format_reward": 0.3928571566939354, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 513.9107360839844, "epoch": 0.23255813953488372, "grad_norm": 0.5421202778816223, "kl": 0.043365478515625, "learning_rate": 5e-06, "loss": 0.0393, "reward": 0.04107143264263868, "reward_std": 0.03282995941117406, "rewards/code_reward": 0.0, "rewards/format_reward": 0.4107143059372902, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 529.9464492797852, "epoch": 0.24806201550387597, "grad_norm": 1.8159050941467285, "kl": 0.0850830078125, "learning_rate": 4.999952797253148e-06, "loss": 0.093, "reward": 0.03928571753203869, "reward_std": 0.04040610417723656, "rewards/code_reward": 0.0, "rewards/format_reward": 0.3928571566939354, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 457.10716247558594, "epoch": 0.26356589147286824, "grad_norm": 0.5563008189201355, "kl": 0.05609130859375, "learning_rate": 4.9998111909931225e-06, "loss": 0.0187, "reward": 0.03392857359722257, "reward_std": 0.022728432901203632, "rewards/code_reward": 0.0, "rewards/format_reward": 0.3392857201397419, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 478.19644927978516, "epoch": 0.27906976744186046, "grad_norm": 0.8018612861633301, "kl": 0.04034423828125, "learning_rate": 4.999575187161439e-06, "loss": 0.0407, "reward": 0.04642857518047094, "reward_std": 0.04545686719939113, "rewards/code_reward": 0.0, "rewards/format_reward": 0.4642857313156128, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 454.67860412597656, "epoch": 0.29457364341085274, "grad_norm": 0.4552249312400818, "kl": 0.04693603515625, "learning_rate": 4.9992447956603455e-06, "loss": 0.013, "reward": 0.055357146076858044, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5535714477300644, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 398.14288330078125, "epoch": 0.31007751937984496, "grad_norm": 0.6622409224510193, "kl": 0.07818603515625, "learning_rate": 4.998820030352409e-06, "loss": 0.0427, "reward": 0.06964286230504513, "reward_std": 0.027779196621850133, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6964286118745804, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 411.42859649658203, "epoch": 0.32558139534883723, "grad_norm": 0.5362735390663147, "kl": 0.0372314453125, "learning_rate": 4.998300909059929e-06, "loss": 0.0515, "reward": 0.07857143320143223, "reward_std": 0.025253815110772848, "rewards/code_reward": 0.0, "rewards/format_reward": 0.785714328289032, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 391.0000228881836, "epoch": 0.34108527131782945, "grad_norm": 0.5802915692329407, "kl": 0.0638427734375, "learning_rate": 4.997687453564198e-06, "loss": 0.0016, "reward": 0.06964286044239998, "reward_std": 0.02777919638901949, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6964285969734192, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 352.16072845458984, "epoch": 0.35658914728682173, "grad_norm": 0.6233690977096558, "kl": 0.1044921875, "learning_rate": 4.9969796896045775e-06, "loss": 0.0163, "reward": 0.07500000484287739, "reward_std": 0.02525381464511156, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7500000447034836, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 357.23216247558594, "epoch": 0.37209302325581395, "grad_norm": 0.5452607274055481, "kl": 0.07611083984375, "learning_rate": 4.996177646877426e-06, "loss": 0.0347, "reward": 0.0803571492433548, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 334.1428680419922, "epoch": 0.3875968992248062, "grad_norm": 0.2985096275806427, "kl": 0.03887939453125, "learning_rate": 4.995281359034851e-06, "loss": 0.0163, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 346.44644927978516, "epoch": 0.40310077519379844, "grad_norm": 0.5771266222000122, "kl": 0.145263671875, "learning_rate": 4.994290863683296e-06, "loss": 0.0187, "reward": 0.08750000782310963, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 342.21429443359375, "epoch": 0.4186046511627907, "grad_norm": 1.0314879417419434, "kl": 0.207275390625, "learning_rate": 4.99320620238196e-06, "loss": 0.0203, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 317.4107360839844, "epoch": 0.43410852713178294, "grad_norm": 0.205219104886055, "kl": 0.04510498046875, "learning_rate": 4.99202742064106e-06, "loss": -0.0048, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 315.37501525878906, "epoch": 0.4496124031007752, "grad_norm": 0.5641531944274902, "kl": 0.0771484375, "learning_rate": 4.990754567919917e-06, "loss": -0.0044, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 315.78572845458984, "epoch": 0.46511627906976744, "grad_norm": 0.15068137645721436, "kl": 0.044952392578125, "learning_rate": 4.989387697624881e-06, "loss": -0.0039, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 362.2500228881836, "epoch": 0.4806201550387597, "grad_norm": 0.42976486682891846, "kl": 0.083160400390625, "learning_rate": 4.987926867107095e-06, "loss": 0.0262, "reward": 0.08571429178118706, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 328.03572845458984, "epoch": 0.49612403100775193, "grad_norm": 0.3264504373073578, "kl": 0.0775146484375, "learning_rate": 4.986372137660078e-06, "loss": 0.0051, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 318.1071548461914, "epoch": 0.5116279069767442, "grad_norm": 0.29669874906539917, "kl": 0.0478515625, "learning_rate": 4.984723574517165e-06, "loss": 0.0067, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 298.25001525878906, "epoch": 0.5271317829457365, "grad_norm": 0.14518219232559204, "kl": 0.0523681640625, "learning_rate": 4.9829812468487655e-06, "loss": -0.0005, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 367.8393020629883, "epoch": 0.5426356589147286, "grad_norm": 0.2966887354850769, "kl": 0.0465087890625, "learning_rate": 4.981145227759457e-06, "loss": 0.0274, "reward": 0.09285714849829674, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714477300644, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 377.19644927978516, "epoch": 0.5581395348837209, "grad_norm": 0.2373329997062683, "kl": 0.04595947265625, "learning_rate": 4.979215594284924e-06, "loss": -0.0014, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 376.28572845458984, "epoch": 0.5736434108527132, "grad_norm": 0.14644701778888702, "kl": 0.04345703125, "learning_rate": 4.977192427388722e-06, "loss": 0.0002, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 360.6964416503906, "epoch": 0.5891472868217055, "grad_norm": 0.1754084676504135, "kl": 0.04449462890625, "learning_rate": 4.9750758119588824e-06, "loss": -0.0018, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 420.2857437133789, "epoch": 0.6046511627906976, "grad_norm": 0.35420262813568115, "kl": 0.05120849609375, "learning_rate": 4.972865836804349e-06, "loss": 0.0194, "reward": 0.08928572200238705, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 306.25000762939453, "epoch": 0.6201550387596899, "grad_norm": 0.1385747194290161, "kl": 0.03955078125, "learning_rate": 4.970562594651254e-06, "loss": 0.0075, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 383.6607360839844, "epoch": 0.6356589147286822, "grad_norm": 0.17503131926059723, "kl": 0.03948974609375, "learning_rate": 4.968166182139026e-06, "loss": 0.0062, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 440.57144927978516, "epoch": 0.6511627906976745, "grad_norm": 0.1598208099603653, "kl": 0.04473876953125, "learning_rate": 4.9656766998163306e-06, "loss": -0.0023, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 337.1428680419922, "epoch": 0.6666666666666666, "grad_norm": 0.23992463946342468, "kl": 0.05145263671875, "learning_rate": 4.963094252136865e-06, "loss": -0.0088, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 359.94644927978516, "epoch": 0.6821705426356589, "grad_norm": 0.30762046575546265, "kl": 0.048828125, "learning_rate": 4.960418947454958e-06, "loss": -0.007, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 370.23216247558594, "epoch": 0.6976744186046512, "grad_norm": 0.15281742811203003, "kl": 0.04937744140625, "learning_rate": 4.957650898021038e-06, "loss": -0.0022, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 393.32144927978516, "epoch": 0.7131782945736435, "grad_norm": 0.1467144638299942, "kl": 0.04290771484375, "learning_rate": 4.954790219976915e-06, "loss": -0.0129, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 395.44644927978516, "epoch": 0.7286821705426356, "grad_norm": 0.24572958052158356, "kl": 0.04949951171875, "learning_rate": 4.95183703335091e-06, "loss": 0.0161, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 401.5357360839844, "epoch": 0.7441860465116279, "grad_norm": 0.2575523853302002, "kl": 0.052490234375, "learning_rate": 4.948791462052819e-06, "loss": -0.014, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 385.5178680419922, "epoch": 0.7596899224806202, "grad_norm": 0.14166215062141418, "kl": 0.06591796875, "learning_rate": 4.945653633868716e-06, "loss": 0.0074, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 411.2500228881836, "epoch": 0.7751937984496124, "grad_norm": 0.2869671583175659, "kl": 0.04833984375, "learning_rate": 4.942423680455584e-06, "loss": 0.0156, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 387.4643020629883, "epoch": 0.7906976744186046, "grad_norm": 0.30038827657699585, "kl": 0.061279296875, "learning_rate": 4.939101737335802e-06, "loss": -0.0201, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 398.2678756713867, "epoch": 0.8062015503875969, "grad_norm": 0.2649737596511841, "kl": 0.08355712890625, "learning_rate": 4.935687943891447e-06, "loss": 0.0014, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 345.3214340209961, "epoch": 0.8217054263565892, "grad_norm": 0.17817121744155884, "kl": 0.06097412109375, "learning_rate": 4.932182443358458e-06, "loss": 0.002, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 378.82144927978516, "epoch": 0.8372093023255814, "grad_norm": 0.2614600360393524, "kl": 0.05889892578125, "learning_rate": 4.928585382820616e-06, "loss": 0.002, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 395.3035888671875, "epoch": 0.8527131782945736, "grad_norm": 0.19784440100193024, "kl": 0.0565185546875, "learning_rate": 4.924896913203376e-06, "loss": 0.0135, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 370.87501525878906, "epoch": 0.8682170542635659, "grad_norm": 0.2502836585044861, "kl": 0.05908203125, "learning_rate": 4.921117189267535e-06, "loss": 0.0157, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 391.4285888671875, "epoch": 0.8837209302325582, "grad_norm": 0.18611028790473938, "kl": 0.06365966796875, "learning_rate": 4.917246369602742e-06, "loss": -0.0074, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 406.4643020629883, "epoch": 0.8992248062015504, "grad_norm": 0.23732154071331024, "kl": 0.0538330078125, "learning_rate": 4.9132846166208355e-06, "loss": 0.0058, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 449.94644927978516, "epoch": 0.9147286821705426, "grad_norm": 0.1505957543849945, "kl": 0.046630859375, "learning_rate": 4.9092320965490365e-06, "loss": 0.0166, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 458.14288330078125, "epoch": 0.9302325581395349, "grad_norm": 0.22972935438156128, "kl": 0.04986572265625, "learning_rate": 4.905088979422971e-06, "loss": 0.0175, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 334.3035888671875, "epoch": 0.9457364341085271, "grad_norm": 0.3014618158340454, "kl": 0.0665283203125, "learning_rate": 4.900855439079536e-06, "loss": -0.003, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 461.3750228881836, "epoch": 0.9612403100775194, "grad_norm": 0.22455042600631714, "kl": 0.059326171875, "learning_rate": 4.8965316531496055e-06, "loss": 0.0138, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 458.57144927978516, "epoch": 0.9767441860465116, "grad_norm": 0.12987832725048065, "kl": 0.05975341796875, "learning_rate": 4.892117803050578e-06, "loss": 0.0333, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 380.4375, "epoch": 0.9922480620155039, "grad_norm": 0.029962124302983284, "kl": 0.0517578125, "learning_rate": 4.887614073978761e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 375.07144927978516, "epoch": 1.0155038759689923, "grad_norm": 0.20877622067928314, "kl": 0.06829833984375, "learning_rate": 4.883020654901609e-06, "loss": -0.0029, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 395.5178756713867, "epoch": 1.0310077519379846, "grad_norm": 0.2041924148797989, "kl": 0.06427001953125, "learning_rate": 4.878337738549785e-06, "loss": -0.0086, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 385.44644927978516, "epoch": 1.0465116279069768, "grad_norm": 0.1686294972896576, "kl": 0.044921875, "learning_rate": 4.873565521409082e-06, "loss": 0.0128, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 349.25001525878906, "epoch": 1.062015503875969, "grad_norm": 0.22423683106899261, "kl": 0.0953369140625, "learning_rate": 4.868704203712173e-06, "loss": -0.0024, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 441.6964416503906, "epoch": 1.0775193798449612, "grad_norm": 0.335616797208786, "kl": 0.0814208984375, "learning_rate": 4.86375398943021e-06, "loss": 0.0239, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 358.3928756713867, "epoch": 1.0930232558139534, "grad_norm": 0.019625969231128693, "kl": 0.044921875, "learning_rate": 4.858715086264274e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 369.2500228881836, "epoch": 1.1085271317829457, "grad_norm": 0.017356975004076958, "kl": 0.04278564453125, "learning_rate": 4.853587705636646e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 415.2678756713867, "epoch": 1.124031007751938, "grad_norm": 1.0988541841506958, "kl": 0.21514892578125, "learning_rate": 4.84837206268195e-06, "loss": -0.0002, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 401.60716247558594, "epoch": 1.1395348837209303, "grad_norm": 0.07964562624692917, "kl": 0.0548095703125, "learning_rate": 4.8430683762381195e-06, "loss": 0.0181, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 433.8571548461914, "epoch": 1.1550387596899225, "grad_norm": 0.3271082043647766, "kl": 0.067138671875, "learning_rate": 4.837676868837213e-06, "loss": 0.0282, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 473.7500228881836, "epoch": 1.1705426356589148, "grad_norm": 0.11898969113826752, "kl": 0.04522705078125, "learning_rate": 4.832197766696085e-06, "loss": 0.0467, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 332.6250228881836, "epoch": 1.1860465116279069, "grad_norm": 0.21207605302333832, "kl": 0.08026123046875, "learning_rate": 4.826631299706887e-06, "loss": -0.0032, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 379.4285888671875, "epoch": 1.2015503875968991, "grad_norm": 0.22306586802005768, "kl": 0.062255859375, "learning_rate": 4.820977701427424e-06, "loss": 0.0056, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 338.7143020629883, "epoch": 1.2170542635658914, "grad_norm": 0.16933457553386688, "kl": 0.064208984375, "learning_rate": 4.81523720907136e-06, "loss": -0.0063, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 362.8393020629883, "epoch": 1.2325581395348837, "grad_norm": 0.8381237983703613, "kl": 0.0596923828125, "learning_rate": 4.809410063498254e-06, "loss": 0.0006, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 414.0714416503906, "epoch": 1.248062015503876, "grad_norm": 0.20738717913627625, "kl": 0.05206298828125, "learning_rate": 4.8034965092034656e-06, "loss": 0.0313, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 407.4285888671875, "epoch": 1.2635658914728682, "grad_norm": 0.3369165062904358, "kl": 0.08013916015625, "learning_rate": 4.797496794307889e-06, "loss": 0.0061, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 408.1071548461914, "epoch": 1.2790697674418605, "grad_norm": 6.407706260681152, "kl": 1.0523681640625, "learning_rate": 4.791411170547545e-06, "loss": 0.0007, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 340.71429443359375, "epoch": 1.2945736434108528, "grad_norm": 0.1786692589521408, "kl": 0.05548095703125, "learning_rate": 4.785239893263017e-06, "loss": 0.0136, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 381.69644927978516, "epoch": 1.310077519379845, "grad_norm": 0.06986937671899796, "kl": 0.0423583984375, "learning_rate": 4.778983221388742e-06, "loss": 0.0186, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 395.60716247558594, "epoch": 1.3255813953488373, "grad_norm": 1.0251904726028442, "kl": 0.088134765625, "learning_rate": 4.77264141744214e-06, "loss": -0.0158, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 360.3393020629883, "epoch": 1.3410852713178294, "grad_norm": 0.41934439539909363, "kl": 0.08447265625, "learning_rate": 4.766214747512603e-06, "loss": -0.009, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 382.8214416503906, "epoch": 1.3565891472868217, "grad_norm": 0.2629234194755554, "kl": 0.05426025390625, "learning_rate": 4.759703481250331e-06, "loss": 0.0143, "reward": 0.09464286454021931, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 319.1607360839844, "epoch": 1.372093023255814, "grad_norm": 0.4782038629055023, "kl": 0.085693359375, "learning_rate": 4.753107891855015e-06, "loss": -0.0169, "reward": 0.08571429178118706, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 361.7678680419922, "epoch": 1.3875968992248062, "grad_norm": 0.3007795512676239, "kl": 0.0552978515625, "learning_rate": 4.746428256064375e-06, "loss": 0.0267, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 513.6785888671875, "epoch": 1.4031007751937985, "grad_norm": 0.37290942668914795, "kl": 0.06085205078125, "learning_rate": 4.7396648541425534e-06, "loss": 0.0452, "reward": 0.0857142936438322, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 487.2143020629883, "epoch": 1.4186046511627908, "grad_norm": 0.27805617451667786, "kl": 0.06781005859375, "learning_rate": 4.732817969868348e-06, "loss": 0.0474, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 458.76788330078125, "epoch": 1.4341085271317828, "grad_norm": 0.23480121791362762, "kl": 0.0584716796875, "learning_rate": 4.7258878905233095e-06, "loss": 0.0453, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 450.4643096923828, "epoch": 1.449612403100775, "grad_norm": 0.3803044855594635, "kl": 0.0882568359375, "learning_rate": 4.718874906879688e-06, "loss": 0.0698, "reward": 0.08214286155998707, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 431.6071548461914, "epoch": 1.4651162790697674, "grad_norm": 0.2859114110469818, "kl": 0.05865478515625, "learning_rate": 4.711779313188231e-06, "loss": 0.0484, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 482.3750228881836, "epoch": 1.4806201550387597, "grad_norm": 0.27567797899246216, "kl": 0.06036376953125, "learning_rate": 4.70460140716584e-06, "loss": 0.0909, "reward": 0.08750000782310963, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 369.0714416503906, "epoch": 1.496124031007752, "grad_norm": 0.29379433393478394, "kl": 0.05914306640625, "learning_rate": 4.697341489983076e-06, "loss": 0.0258, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 355.6428756713867, "epoch": 1.5116279069767442, "grad_norm": 0.34517690539360046, "kl": 0.0787353515625, "learning_rate": 4.6899998662515215e-06, "loss": 0.0207, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 428.80360412597656, "epoch": 1.5271317829457365, "grad_norm": 0.21387967467308044, "kl": 0.05517578125, "learning_rate": 4.682576844011007e-06, "loss": 0.0527, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 445.87501525878906, "epoch": 1.5426356589147288, "grad_norm": 0.27787187695503235, "kl": 0.06585693359375, "learning_rate": 4.675072734716678e-06, "loss": 0.0585, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 521.1071701049805, "epoch": 1.558139534883721, "grad_norm": 0.2258799970149994, "kl": 0.0574951171875, "learning_rate": 4.667487853225931e-06, "loss": 0.0816, "reward": 0.08928572200238705, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 500.60716247558594, "epoch": 1.5736434108527133, "grad_norm": 0.18799901008605957, "kl": 0.05963134765625, "learning_rate": 4.659822517785203e-06, "loss": 0.0686, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 431.82144927978516, "epoch": 1.5891472868217056, "grad_norm": 0.33642229437828064, "kl": 0.0623779296875, "learning_rate": 4.6520770500166165e-06, "loss": 0.022, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 418.5357360839844, "epoch": 1.6046511627906976, "grad_norm": 0.17402252554893494, "kl": 0.05926513671875, "learning_rate": 4.644251774904487e-06, "loss": 0.0472, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 341.46429443359375, "epoch": 1.62015503875969, "grad_norm": 0.16139744222164154, "kl": 0.05474853515625, "learning_rate": 4.636347020781684e-06, "loss": 0.0078, "reward": 0.09464286640286446, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 376.16072845458984, "epoch": 1.6356589147286822, "grad_norm": 0.20297406613826752, "kl": 0.16015625, "learning_rate": 4.6283631193158605e-06, "loss": -0.0391, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 307.9285888671875, "epoch": 1.6511627906976745, "grad_norm": 0.02185610495507717, "kl": 0.0635986328125, "learning_rate": 4.620300405495532e-06, "loss": 0.0006, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 427.39288330078125, "epoch": 1.6666666666666665, "grad_norm": 0.3790690004825592, "kl": 0.08642578125, "learning_rate": 4.612159217616022e-06, "loss": 0.0327, "reward": 0.08750000409781933, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 395.0893020629883, "epoch": 1.6821705426356588, "grad_norm": 0.1745668351650238, "kl": 0.05218505859375, "learning_rate": 4.603939897265268e-06, "loss": 0.0428, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 332.05358123779297, "epoch": 1.697674418604651, "grad_norm": 0.1193130612373352, "kl": 0.0545654296875, "learning_rate": 4.595642789309492e-06, "loss": 0.0111, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 355.9821548461914, "epoch": 1.7131782945736433, "grad_norm": 0.20527540147304535, "kl": 0.07867431640625, "learning_rate": 4.587268241878724e-06, "loss": 0.0454, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 389.0357322692871, "epoch": 1.7286821705426356, "grad_norm": 0.08345554023981094, "kl": 0.06036376953125, "learning_rate": 4.578816606352205e-06, "loss": 0.0398, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 408.4643096923828, "epoch": 1.744186046511628, "grad_norm": 0.1863243132829666, "kl": 0.06109619140625, "learning_rate": 4.570288237343632e-06, "loss": 0.0374, "reward": 0.09107143804430962, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 330.6428680419922, "epoch": 1.7596899224806202, "grad_norm": 0.03056999109685421, "kl": 0.0721435546875, "learning_rate": 4.561683492686289e-06, "loss": 0.0007, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 331.5714416503906, "epoch": 1.7751937984496124, "grad_norm": 0.23037225008010864, "kl": 0.06451416015625, "learning_rate": 4.5530027334180285e-06, "loss": -0.0047, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 312.2678756713867, "epoch": 1.7906976744186047, "grad_norm": 0.03742313012480736, "kl": 0.0576171875, "learning_rate": 4.544246323766122e-06, "loss": 0.0006, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 488.35716247558594, "epoch": 1.806201550387597, "grad_norm": 0.2483549565076828, "kl": 0.05865478515625, "learning_rate": 4.535414631131983e-06, "loss": 0.036, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 379.8035888671875, "epoch": 1.8217054263565893, "grad_norm": 0.30526259541511536, "kl": 0.08013916015625, "learning_rate": 4.526508026075746e-06, "loss": 0.0156, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 341.1428756713867, "epoch": 1.8372093023255816, "grad_norm": 0.02339295670390129, "kl": 0.056884765625, "learning_rate": 4.517526882300721e-06, "loss": 0.0006, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 388.2143020629883, "epoch": 1.8527131782945736, "grad_norm": 0.3421875834465027, "kl": 0.0506591796875, "learning_rate": 4.508471576637713e-06, "loss": 0.037, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 325.6964416503906, "epoch": 1.8682170542635659, "grad_norm": 0.0237069521099329, "kl": 0.04962158203125, "learning_rate": 4.499342489029211e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 364.0714416503906, "epoch": 1.8837209302325582, "grad_norm": 0.2091287523508072, "kl": 0.0748291015625, "learning_rate": 4.490140002513449e-06, "loss": 0.0171, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 380.4107360839844, "epoch": 1.8992248062015504, "grad_norm": 0.07675088196992874, "kl": 0.05084228515625, "learning_rate": 4.48086450320833e-06, "loss": 0.0214, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 329.1428756713867, "epoch": 1.9147286821705425, "grad_norm": 0.017835261300206184, "kl": 0.04571533203125, "learning_rate": 4.4715163802952266e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 356.7321548461914, "epoch": 1.9302325581395348, "grad_norm": 3.949739694595337, "kl": 0.5897216796875, "learning_rate": 4.462096026002655e-06, "loss": 0.0059, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 383.5893020629883, "epoch": 1.945736434108527, "grad_norm": 0.06876012682914734, "kl": 0.05108642578125, "learning_rate": 4.4526038355898144e-06, "loss": 0.0192, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 342.75001525878906, "epoch": 1.9612403100775193, "grad_norm": 0.01601524092257023, "kl": 0.0462646484375, "learning_rate": 4.4430402073300035e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 340.2143020629883, "epoch": 1.9767441860465116, "grad_norm": 0.01760284975171089, "kl": 0.04705810546875, "learning_rate": 4.433405542493909e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 425.6875, "epoch": 1.9922480620155039, "grad_norm": 0.30735570192337036, "kl": 0.07025146484375, "learning_rate": 4.4237002453327734e-06, "loss": -0.0102, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 398.1964569091797, "epoch": 2.0155038759689923, "grad_norm": 0.2743019163608551, "kl": 0.05218505859375, "learning_rate": 4.4139247230614245e-06, "loss": 0.012, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 355.48216247558594, "epoch": 2.0310077519379846, "grad_norm": 0.02317599020898342, "kl": 0.05474853515625, "learning_rate": 4.404079385841201e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 440.14288330078125, "epoch": 2.046511627906977, "grad_norm": 0.12499076128005981, "kl": 0.060546875, "learning_rate": 4.394164646762734e-06, "loss": 0.0395, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 349.6607360839844, "epoch": 2.062015503875969, "grad_norm": 0.3491656482219696, "kl": 0.0546875, "learning_rate": 4.384180921828618e-06, "loss": -0.0162, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 356.9285888671875, "epoch": 2.0775193798449614, "grad_norm": 0.15453527867794037, "kl": 0.04852294921875, "learning_rate": 4.374128629935955e-06, "loss": 0.0289, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 331.9464416503906, "epoch": 2.0930232558139537, "grad_norm": 0.016740234568715096, "kl": 0.0595703125, "learning_rate": 4.364008192858781e-06, "loss": 0.0006, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 367.85716247558594, "epoch": 2.108527131782946, "grad_norm": 0.021411418914794922, "kl": 0.052001953125, "learning_rate": 4.353820035230366e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 333.62501525878906, "epoch": 2.124031007751938, "grad_norm": 0.01767720840871334, "kl": 0.054443359375, "learning_rate": 4.3435645845254e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 379.8393096923828, "epoch": 2.13953488372093, "grad_norm": 0.041266124695539474, "kl": 0.0701904296875, "learning_rate": 4.333242271042054e-06, "loss": 0.0007, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 366.1071548461914, "epoch": 2.1550387596899223, "grad_norm": 0.20214009284973145, "kl": 0.0625, "learning_rate": 4.32285352788393e-06, "loss": 0.0047, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 403.25001525878906, "epoch": 2.1705426356589146, "grad_norm": 0.14242787659168243, "kl": 0.0548095703125, "learning_rate": 4.312398790941882e-06, "loss": 0.0036, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 381.4464416503906, "epoch": 2.186046511627907, "grad_norm": 0.2580341100692749, "kl": 0.0653076171875, "learning_rate": 4.301878498875735e-06, "loss": 0.005, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 327.6428680419922, "epoch": 2.201550387596899, "grad_norm": 0.2158711850643158, "kl": 0.05181884765625, "learning_rate": 4.291293093095873e-06, "loss": -0.0069, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 352.7321548461914, "epoch": 2.2170542635658914, "grad_norm": 0.12166262418031693, "kl": 0.0693359375, "learning_rate": 4.280643017744723e-06, "loss": 0.0219, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 348.8393020629883, "epoch": 2.2325581395348837, "grad_norm": 0.07379510253667831, "kl": 0.06195068359375, "learning_rate": 4.269928719678117e-06, "loss": 0.0219, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 338.75001525878906, "epoch": 2.248062015503876, "grad_norm": 0.014747419394552708, "kl": 0.04913330078125, "learning_rate": 4.2591506484465426e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 440.62501525878906, "epoch": 2.2635658914728682, "grad_norm": 0.09444686770439148, "kl": 0.0557861328125, "learning_rate": 4.248309256276283e-06, "loss": 0.029, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 300.6785888671875, "epoch": 2.2790697674418605, "grad_norm": 0.018641607835888863, "kl": 0.05596923828125, "learning_rate": 4.23740499805044e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 366.60716247558594, "epoch": 2.294573643410853, "grad_norm": 0.06887350976467133, "kl": 0.04541015625, "learning_rate": 4.22643833128985e-06, "loss": 0.0169, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 416.73216247558594, "epoch": 2.310077519379845, "grad_norm": 0.06718391925096512, "kl": 0.05230712890625, "learning_rate": 4.215409716133885e-06, "loss": 0.0397, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 381.0000228881836, "epoch": 2.3255813953488373, "grad_norm": 0.1807098686695099, "kl": 0.05438232421875, "learning_rate": 4.204319615321151e-06, "loss": -0.0073, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 390.5357360839844, "epoch": 2.3410852713178296, "grad_norm": 0.1976163387298584, "kl": 0.07647705078125, "learning_rate": 4.193168494170065e-06, "loss": 0.0157, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 450.6428680419922, "epoch": 2.356589147286822, "grad_norm": 0.06315501034259796, "kl": 0.0489501953125, "learning_rate": 4.181956820559339e-06, "loss": 0.0366, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 371.6785888671875, "epoch": 2.3720930232558137, "grad_norm": 0.05957993492484093, "kl": 0.04547119140625, "learning_rate": 4.170685064908342e-06, "loss": 0.0189, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 372.7143020629883, "epoch": 2.387596899224806, "grad_norm": 0.011611810885369778, "kl": 0.04443359375, "learning_rate": 4.159353700157365e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 372.6250228881836, "epoch": 2.4031007751937983, "grad_norm": 0.03919665887951851, "kl": 0.0506591796875, "learning_rate": 4.14796320174778e-06, "loss": 0.0189, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 329.1428680419922, "epoch": 2.4186046511627906, "grad_norm": 0.0129386056214571, "kl": 0.0457763671875, "learning_rate": 4.136514047602087e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 417.12501525878906, "epoch": 2.434108527131783, "grad_norm": 0.08656369149684906, "kl": 0.0552978515625, "learning_rate": 4.1250067181038635e-06, "loss": 0.0549, "reward": 0.09464286454021931, "reward_std": 0.007576144300401211, "rewards/code_reward": 0.0, "rewards/format_reward": 0.946428582072258, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 356.2321586608887, "epoch": 2.449612403100775, "grad_norm": 0.09912148863077164, "kl": 0.06451416015625, "learning_rate": 4.113441696077608e-06, "loss": 0.0215, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 381.1964416503906, "epoch": 2.4651162790697674, "grad_norm": 0.16969716548919678, "kl": 0.05877685546875, "learning_rate": 4.101819466768484e-06, "loss": 0.0143, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 333.3214416503906, "epoch": 2.4806201550387597, "grad_norm": 0.011440815404057503, "kl": 0.04791259765625, "learning_rate": 4.0901405178219535e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 350.73216247558594, "epoch": 2.496124031007752, "grad_norm": 0.036816567182540894, "kl": 0.04779052734375, "learning_rate": 4.078405339263326e-06, "loss": 0.0217, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 361.5535888671875, "epoch": 2.511627906976744, "grad_norm": 0.026230594143271446, "kl": 0.0477294921875, "learning_rate": 4.06661442347719e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 341.0178680419922, "epoch": 2.5271317829457365, "grad_norm": 0.012676913291215897, "kl": 0.04327392578125, "learning_rate": 4.054768265186758e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 347.12501525878906, "epoch": 2.5426356589147288, "grad_norm": 0.012835390865802765, "kl": 0.04644775390625, "learning_rate": 4.0428673614331036e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 398.5178756713867, "epoch": 2.558139534883721, "grad_norm": 0.13431993126869202, "kl": 0.0458984375, "learning_rate": 4.030912211554316e-06, "loss": 0.0172, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 382.1607208251953, "epoch": 2.5736434108527133, "grad_norm": 0.1748451292514801, "kl": 0.04718017578125, "learning_rate": 4.018903317164539e-06, "loss": 0.0086, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 461.58931732177734, "epoch": 2.5891472868217056, "grad_norm": 0.2945731580257416, "kl": 0.0743408203125, "learning_rate": 4.006841182132932e-06, "loss": 0.0594, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 396.3393096923828, "epoch": 2.604651162790698, "grad_norm": 0.04062338173389435, "kl": 0.0452880859375, "learning_rate": 3.9947263125625195e-06, "loss": 0.0198, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 416.41072845458984, "epoch": 2.62015503875969, "grad_norm": 136.00784301757812, "kl": 1.88348388671875, "learning_rate": 3.982559216768967e-06, "loss": 0.07, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 289.2678680419922, "epoch": 2.6356589147286824, "grad_norm": 0.021940352395176888, "kl": 0.056640625, "learning_rate": 3.970340405259245e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 371.0178756713867, "epoch": 2.6511627906976747, "grad_norm": 0.24385367333889008, "kl": 0.12432861328125, "learning_rate": 3.958070390710214e-06, "loss": 0.0023, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 343.51787185668945, "epoch": 2.6666666666666665, "grad_norm": 0.048726681619882584, "kl": 0.0433349609375, "learning_rate": 3.945749687947109e-06, "loss": 0.0171, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 370.1607360839844, "epoch": 2.682170542635659, "grad_norm": 0.16867610812187195, "kl": 0.06219482421875, "learning_rate": 3.933378813921942e-06, "loss": -0.0057, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 349.2857360839844, "epoch": 2.697674418604651, "grad_norm": 0.23669791221618652, "kl": 0.11749267578125, "learning_rate": 3.920958287691811e-06, "loss": -0.0063, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 339.57144927978516, "epoch": 2.7131782945736433, "grad_norm": 0.04533864185214043, "kl": 0.0574951171875, "learning_rate": 3.908488630397121e-06, "loss": 0.0006, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 309.00001525878906, "epoch": 2.7286821705426356, "grad_norm": 0.08899213373661041, "kl": 0.05792236328125, "learning_rate": 3.8959703652397175e-06, "loss": 0.0087, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 377.30359649658203, "epoch": 2.744186046511628, "grad_norm": 0.05321886017918587, "kl": 0.05023193359375, "learning_rate": 3.883404017460935e-06, "loss": 0.0179, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 440.39288330078125, "epoch": 2.75968992248062, "grad_norm": 0.07496553659439087, "kl": 0.0521240234375, "learning_rate": 3.870790114319559e-06, "loss": 0.0422, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 424.0357360839844, "epoch": 2.7751937984496124, "grad_norm": 0.24973690509796143, "kl": 0.05303955078125, "learning_rate": 3.858129185069701e-06, "loss": 0.0246, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 346.25001525878906, "epoch": 2.7906976744186047, "grad_norm": 0.048278991132974625, "kl": 0.07000732421875, "learning_rate": 3.845421760938597e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 346.9464416503906, "epoch": 2.806201550387597, "grad_norm": 0.09487508982419968, "kl": 0.07293701171875, "learning_rate": 3.832668375104312e-06, "loss": 0.0159, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 357.0535888671875, "epoch": 2.8217054263565893, "grad_norm": 0.013098032213747501, "kl": 0.04278564453125, "learning_rate": 3.8198695626733725e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 311.6607360839844, "epoch": 2.8372093023255816, "grad_norm": 0.013701863586902618, "kl": 0.0498046875, "learning_rate": 3.8070258606583156e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 418.73216247558594, "epoch": 2.8527131782945734, "grad_norm": 0.0741962194442749, "kl": 0.06573486328125, "learning_rate": 3.7941378079551544e-06, "loss": 0.0418, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 438.21429443359375, "epoch": 2.8682170542635657, "grad_norm": 0.08349604904651642, "kl": 0.05194091796875, "learning_rate": 3.7812059453207677e-06, "loss": 0.0336, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 491.51788330078125, "epoch": 2.883720930232558, "grad_norm": 0.17697231471538544, "kl": 0.05194091796875, "learning_rate": 3.768230815350213e-06, "loss": 0.0333, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 363.3571548461914, "epoch": 2.89922480620155, "grad_norm": 0.012233450077474117, "kl": 0.0384521484375, "learning_rate": 3.7552129624539557e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 374.66072845458984, "epoch": 2.9147286821705425, "grad_norm": 0.09323134273290634, "kl": 0.04193115234375, "learning_rate": 3.7421529328350316e-06, "loss": 0.018, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 343.25001525878906, "epoch": 2.9302325581395348, "grad_norm": 0.17554545402526855, "kl": 0.05010986328125, "learning_rate": 3.7290512744661274e-06, "loss": -0.0053, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 346.7857208251953, "epoch": 2.945736434108527, "grad_norm": 0.012803681194782257, "kl": 0.041259765625, "learning_rate": 3.715908537066589e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 393.5357360839844, "epoch": 2.9612403100775193, "grad_norm": 0.048317961394786835, "kl": 0.04132080078125, "learning_rate": 3.7027252720793538e-06, "loss": 0.016, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 422.7143096923828, "epoch": 2.9767441860465116, "grad_norm": 0.6855605244636536, "kl": 0.22027587890625, "learning_rate": 3.689502032647817e-06, "loss": 0.0154, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 356.15625, "epoch": 2.992248062015504, "grad_norm": 0.014512370340526104, "kl": 0.04461669921875, "learning_rate": 3.6762393735926245e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 416.5893020629883, "epoch": 3.0155038759689923, "grad_norm": 0.39832741022109985, "kl": 0.10870361328125, "learning_rate": 3.6629378513883852e-06, "loss": 0.0074, "reward": 0.09107143618166447, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 366.1964416503906, "epoch": 3.0310077519379846, "grad_norm": 0.19380412995815277, "kl": 0.048828125, "learning_rate": 3.6495980241403307e-06, "loss": -0.0012, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 428.7321548461914, "epoch": 3.046511627906977, "grad_norm": 0.2720411717891693, "kl": 0.0596923828125, "learning_rate": 3.636220451560896e-06, "loss": 0.0191, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 377.3035888671875, "epoch": 3.062015503875969, "grad_norm": 0.06497833132743835, "kl": 0.0489501953125, "learning_rate": 3.622805694946235e-06, "loss": 0.013, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 473.8035888671875, "epoch": 3.0775193798449614, "grad_norm": 0.016108330339193344, "kl": 0.04351806640625, "learning_rate": 3.609354317152667e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 443.94644927978516, "epoch": 3.0930232558139537, "grad_norm": 9.35707950592041, "kl": 0.8321533203125, "learning_rate": 3.595866882573063e-06, "loss": 0.0221, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 404.50001525878906, "epoch": 3.108527131782946, "grad_norm": 0.013096613809466362, "kl": 0.03851318359375, "learning_rate": 3.5823439571131675e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 449.2678756713867, "epoch": 3.124031007751938, "grad_norm": 0.14225821197032928, "kl": 0.04180908203125, "learning_rate": 3.5687861081678477e-06, "loss": 0.0035, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 483.92859649658203, "epoch": 3.13953488372093, "grad_norm": 0.18597643077373505, "kl": 0.045745849609375, "learning_rate": 3.555193904597291e-06, "loss": 0.0368, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 385.9643096923828, "epoch": 3.1550387596899223, "grad_norm": 0.29447314143180847, "kl": 0.12945556640625, "learning_rate": 3.541567916703138e-06, "loss": -0.006, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 453.83931732177734, "epoch": 3.1705426356589146, "grad_norm": 0.1709349900484085, "kl": 0.0709228515625, "learning_rate": 3.5279087162045517e-06, "loss": 0.0165, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 423.6964416503906, "epoch": 3.186046511627907, "grad_norm": 0.28060221672058105, "kl": 0.04339599609375, "learning_rate": 3.5142168762142265e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 383.78572845458984, "epoch": 3.201550387596899, "grad_norm": 0.06871840357780457, "kl": 0.037109375, "learning_rate": 3.500492971214347e-06, "loss": 0.0126, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 422.1964569091797, "epoch": 3.2170542635658914, "grad_norm": 0.15468138456344604, "kl": 0.04852294921875, "learning_rate": 3.48673757703248e-06, "loss": 0.0165, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 365.0893020629883, "epoch": 3.2325581395348837, "grad_norm": 0.18128041923046112, "kl": 0.04852294921875, "learning_rate": 3.472951270817418e-06, "loss": -0.002, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 370.8393096923828, "epoch": 3.248062015503876, "grad_norm": 0.05891520529985428, "kl": 0.0443115234375, "learning_rate": 3.4591346310149578e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 392.67859649658203, "epoch": 3.2635658914728682, "grad_norm": 0.03135214000940323, "kl": 0.04595947265625, "learning_rate": 3.445288237343632e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 481.7143096923828, "epoch": 3.2790697674418605, "grad_norm": 0.07085608690977097, "kl": 0.04840087890625, "learning_rate": 3.4314126707703895e-06, "loss": 0.0141, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 439.2678756713867, "epoch": 3.294573643410853, "grad_norm": 0.2375049591064453, "kl": 0.04571533203125, "learning_rate": 3.4175085134862128e-06, "loss": 0.0463, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 446.10716247558594, "epoch": 3.310077519379845, "grad_norm": 0.11966075003147125, "kl": 0.0457763671875, "learning_rate": 3.4035763488816953e-06, "loss": 0.0118, "reward": 0.09464286454021931, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 461.8393096923828, "epoch": 3.3255813953488373, "grad_norm": 0.22044996917247772, "kl": 0.05328369140625, "learning_rate": 3.3896167615225594e-06, "loss": 0.003, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 445.7857360839844, "epoch": 3.3410852713178296, "grad_norm": 0.12458500266075134, "kl": 0.0550537109375, "learning_rate": 3.375630337125133e-06, "loss": 0.0142, "reward": 0.09464286640286446, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 409.30358123779297, "epoch": 3.356589147286822, "grad_norm": 0.4201054871082306, "kl": 0.03924560546875, "learning_rate": 3.361617662531772e-06, "loss": 0.0185, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 341.03572845458984, "epoch": 3.3720930232558137, "grad_norm": 0.022784234955906868, "kl": 0.04339599609375, "learning_rate": 3.347579325686237e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 482.7321472167969, "epoch": 3.387596899224806, "grad_norm": 15.430904388427734, "kl": 4.7401123046875, "learning_rate": 3.333515915609027e-06, "loss": 0.1214, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 391.1607360839844, "epoch": 3.4031007751937983, "grad_norm": 2.3437180519104004, "kl": 0.5184326171875, "learning_rate": 3.3194280223726616e-06, "loss": 0.0116, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 383.30358123779297, "epoch": 3.4186046511627906, "grad_norm": 1.4211961030960083, "kl": 0.56317138671875, "learning_rate": 3.305316237076927e-06, "loss": 0.0405, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 390.73216247558594, "epoch": 3.434108527131783, "grad_norm": 11.418591499328613, "kl": 1.92529296875, "learning_rate": 3.291181151824071e-06, "loss": 0.0215, "reward": 0.09107143618166447, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 427.0000305175781, "epoch": 3.449612403100775, "grad_norm": 4.976629734039307, "kl": 0.14874267578125, "learning_rate": 3.27702335969396e-06, "loss": 0.0572, "reward": 0.0892857201397419, "reward_std": 0.015152288600802422, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 427.9285888671875, "epoch": 3.4651162790697674, "grad_norm": 0.43471699953079224, "kl": 0.1776123046875, "learning_rate": 3.2628434547191985e-06, "loss": -0.0052, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 348.7321548461914, "epoch": 3.4806201550387597, "grad_norm": 0.029118988662958145, "kl": 0.04644775390625, "learning_rate": 3.2486420318601973e-06, "loss": 0.0005, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 368.0714416503906, "epoch": 3.496124031007752, "grad_norm": 0.1969047337770462, "kl": 0.06158447265625, "learning_rate": 3.2344196869802187e-06, "loss": 0.0168, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 359.48217010498047, "epoch": 3.511627906976744, "grad_norm": 0.05316058173775673, "kl": 0.04388427734375, "learning_rate": 3.2201770168203694e-06, "loss": 0.0004, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 416.55358123779297, "epoch": 3.5271317829457365, "grad_norm": 1.583228349685669, "kl": 0.6258544921875, "learning_rate": 3.205914618974563e-06, "loss": 0.0457, "reward": 0.08750000596046448, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 396.3214416503906, "epoch": 3.5426356589147288, "grad_norm": 0.8248907923698425, "kl": 0.10003662109375, "learning_rate": 3.1916330918644496e-06, "loss": 0.0307, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 430.0535888671875, "epoch": 3.558139534883721, "grad_norm": 0.37740814685821533, "kl": 0.19354248046875, "learning_rate": 3.177333034714303e-06, "loss": 0.0462, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 360.2321548461914, "epoch": 3.5736434108527133, "grad_norm": 0.2104271948337555, "kl": 0.05908203125, "learning_rate": 3.1630150475258813e-06, "loss": 0.0171, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 482.6964569091797, "epoch": 3.5891472868217056, "grad_norm": 3.0259857177734375, "kl": 2.01220703125, "learning_rate": 3.148679731053252e-06, "loss": 0.0681, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 454.9821548461914, "epoch": 3.604651162790698, "grad_norm": 0.9339037537574768, "kl": 0.580322265625, "learning_rate": 3.1343276867775805e-06, "loss": 0.0536, "reward": 0.08571429178118706, "reward_std": 0.015152288600802422, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 459.0178756713867, "epoch": 3.62015503875969, "grad_norm": 1.466651201248169, "kl": 0.37054443359375, "learning_rate": 3.1199595168819043e-06, "loss": 0.0784, "reward": 0.0892857164144516, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 436.00001525878906, "epoch": 3.6356589147286824, "grad_norm": 5.144193649291992, "kl": 0.2822265625, "learning_rate": 3.105575824225852e-06, "loss": 0.0738, "reward": 0.08750000968575478, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000596046448, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 389.12501525878906, "epoch": 3.6511627906976747, "grad_norm": 2.5096967220306396, "kl": 0.32818603515625, "learning_rate": 3.091177212320363e-06, "loss": 0.0412, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 323.6071548461914, "epoch": 3.6666666666666665, "grad_norm": 0.6911299824714661, "kl": 0.5263671875, "learning_rate": 3.0767642853023538e-06, "loss": -0.0336, "reward": 0.09285715036094189, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714477300644, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 446.2143020629883, "epoch": 3.682170542635659, "grad_norm": 5.997923374176025, "kl": 4.308349609375, "learning_rate": 3.062337647909376e-06, "loss": 0.0867, "reward": 0.0892857201397419, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 362.3214416503906, "epoch": 3.697674418604651, "grad_norm": 4.273003578186035, "kl": 2.62890625, "learning_rate": 3.04789790545424e-06, "loss": 0.0346, "reward": 0.0892857238650322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 436.6250228881836, "epoch": 3.7131782945736433, "grad_norm": 0.8113691210746765, "kl": 1.3873291015625, "learning_rate": 3.033445663799621e-06, "loss": 0.0157, "reward": 0.08750000596046448, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 365.8214454650879, "epoch": 3.7286821705426356, "grad_norm": 6.4786882400512695, "kl": 0.678466796875, "learning_rate": 3.018981529332633e-06, "loss": 0.0158, "reward": 0.0892857201397419, "reward_std": 0.015152289299294353, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 359.2678642272949, "epoch": 3.744186046511628, "grad_norm": 2.5377206802368164, "kl": 1.3226318359375, "learning_rate": 3.00450610893939e-06, "loss": 0.0097, "reward": 0.09107143245637417, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 362.1428680419922, "epoch": 3.75968992248062, "grad_norm": 4.049387454986572, "kl": 0.501953125, "learning_rate": 2.9900200099795396e-06, "loss": 0.0179, "reward": 0.09107143245637417, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 388.17859649658203, "epoch": 3.7751937984496124, "grad_norm": 7.053500175476074, "kl": 4.607421875, "learning_rate": 2.9755238402607826e-06, "loss": 0.056, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 320.6250114440918, "epoch": 3.7906976744186047, "grad_norm": 3.405297040939331, "kl": 3.1016845703125, "learning_rate": 2.961018208013367e-06, "loss": 0.0188, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 316.2143020629883, "epoch": 3.806201550387597, "grad_norm": 0.8333543539047241, "kl": 0.9293212890625, "learning_rate": 2.9465037218645694e-06, "loss": 0.0027, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 415.42859268188477, "epoch": 3.8217054263565893, "grad_norm": 0.9986127018928528, "kl": 2.35595703125, "learning_rate": 2.9319809908131604e-06, "loss": -0.0196, "reward": 0.08392857946455479, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 371.17859649658203, "epoch": 3.8372093023255816, "grad_norm": 0.5152451395988464, "kl": 1.0821533203125, "learning_rate": 2.917450624203847e-06, "loss": 0.0239, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 370.41072845458984, "epoch": 3.8527131782945734, "grad_norm": 0.3054462671279907, "kl": 0.1796875, "learning_rate": 2.9029132317017118e-06, "loss": 0.0193, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 312.46429443359375, "epoch": 3.8682170542635657, "grad_norm": 0.3965552747249603, "kl": 0.6539306640625, "learning_rate": 2.888369423266629e-06, "loss": 0.0087, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 296.3571548461914, "epoch": 3.883720930232558, "grad_norm": 1.3105534315109253, "kl": 0.5010986328125, "learning_rate": 2.8738198091276712e-06, "loss": -0.0163, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 282.8035888671875, "epoch": 3.89922480620155, "grad_norm": 0.4267464876174927, "kl": 0.328857421875, "learning_rate": 2.859264999757509e-06, "loss": 0.0033, "reward": 0.0982142947614193, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 334.2143020629883, "epoch": 3.9147286821705425, "grad_norm": 0.5196983814239502, "kl": 0.22705078125, "learning_rate": 2.8447056058467928e-06, "loss": 0.041, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 390.60717010498047, "epoch": 3.9302325581395348, "grad_norm": 2.4938759803771973, "kl": 2.436279296875, "learning_rate": 2.830142238278531e-06, "loss": 0.0643, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 369.7143020629883, "epoch": 3.945736434108527, "grad_norm": 19.98564910888672, "kl": 3.283447265625, "learning_rate": 2.81557550810246e-06, "loss": 0.1163, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 251.08929824829102, "epoch": 3.9612403100775193, "grad_norm": 0.8202245831489563, "kl": 1.39990234375, "learning_rate": 2.8010060265094026e-06, "loss": -0.0223, "reward": 0.09642857685685158, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 405.80359649658203, "epoch": 3.9767441860465116, "grad_norm": 0.16511620581150055, "kl": 0.1580810546875, "learning_rate": 2.786434404805629e-06, "loss": 0.0429, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 435.65625, "epoch": 3.992248062015504, "grad_norm": 0.6317914724349976, "kl": 1.050048828125, "learning_rate": 2.771861254387199e-06, "loss": 0.0243, "reward": 0.09107143618166447, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 387.25000762939453, "epoch": 4.015503875968992, "grad_norm": 2.031557321548462, "kl": 2.938232421875, "learning_rate": 2.7572871867143204e-06, "loss": 0.0425, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 363.5535888671875, "epoch": 4.0310077519379846, "grad_norm": 0.5578765273094177, "kl": 0.310546875, "learning_rate": 2.742712813285681e-06, "loss": 0.0663, "reward": 0.09285714663565159, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 380.5178756713867, "epoch": 4.046511627906977, "grad_norm": 0.3609902858734131, "kl": 0.613037109375, "learning_rate": 2.7281387456128017e-06, "loss": 0.0103, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 431.25001525878906, "epoch": 4.062015503875969, "grad_norm": 0.13468655943870544, "kl": 0.6370849609375, "learning_rate": 2.7135655951943716e-06, "loss": 0.0617, "reward": 0.09107143431901932, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 424.642879486084, "epoch": 4.077519379844961, "grad_norm": 0.7959390878677368, "kl": 1.122314453125, "learning_rate": 2.698993973490598e-06, "loss": 0.04, "reward": 0.0892857201397419, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 325.5178756713867, "epoch": 4.093023255813954, "grad_norm": 0.541179895401001, "kl": 0.6156005859375, "learning_rate": 2.6844244918975416e-06, "loss": 0.009, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 484.4643096923828, "epoch": 4.108527131782946, "grad_norm": 0.6031014919281006, "kl": 0.8077392578125, "learning_rate": 2.66985776172147e-06, "loss": 0.0724, "reward": 0.08750000596046448, "reward_std": 0.017677670810371637, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 322.8035888671875, "epoch": 4.124031007751938, "grad_norm": 0.9272570610046387, "kl": 0.213623046875, "learning_rate": 2.6552943941532088e-06, "loss": 0.014, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 353.35716247558594, "epoch": 4.1395348837209305, "grad_norm": 0.8877429366111755, "kl": 0.3662109375, "learning_rate": 2.6407350002424927e-06, "loss": 0.0444, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 388.8571548461914, "epoch": 4.155038759689923, "grad_norm": 0.2517595887184143, "kl": 0.248046875, "learning_rate": 2.626180190872329e-06, "loss": 0.0563, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 397.2857360839844, "epoch": 4.170542635658915, "grad_norm": 0.45230913162231445, "kl": 0.7420654296875, "learning_rate": 2.611630576733372e-06, "loss": 0.0146, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 352.67858123779297, "epoch": 4.186046511627907, "grad_norm": 4.2593770027160645, "kl": 4.50927734375, "learning_rate": 2.5970867682982885e-06, "loss": 0.068, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 337.42859268188477, "epoch": 4.2015503875969, "grad_norm": 7.50376033782959, "kl": 10.3663330078125, "learning_rate": 2.582549375796154e-06, "loss": 0.0469, "reward": 0.0892857201397419, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 367.7143135070801, "epoch": 4.217054263565892, "grad_norm": 0.7440657615661621, "kl": 0.711181640625, "learning_rate": 2.568019009186841e-06, "loss": 0.0718, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 337.96429443359375, "epoch": 4.232558139534884, "grad_norm": 0.491629421710968, "kl": 1.0819091796875, "learning_rate": 2.5534962781354317e-06, "loss": 0.039, "reward": 0.09464286454021931, "reward_std": 0.007576144300401211, "rewards/code_reward": 0.0, "rewards/format_reward": 0.946428582072258, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 377.5893020629883, "epoch": 4.248062015503876, "grad_norm": 0.3443954885005951, "kl": 0.227294921875, "learning_rate": 2.538981791986634e-06, "loss": 0.0389, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 383.6428680419922, "epoch": 4.263565891472869, "grad_norm": 1.2458945512771606, "kl": 0.91650390625, "learning_rate": 2.524476159739218e-06, "loss": 0.0354, "reward": 0.08928572200238705, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 354.92858123779297, "epoch": 4.27906976744186, "grad_norm": 1.8443901538848877, "kl": 1.0614013671875, "learning_rate": 2.5099799900204607e-06, "loss": 0.0072, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 440.35716247558594, "epoch": 4.294573643410852, "grad_norm": 0.3539612293243408, "kl": 0.1920166015625, "learning_rate": 2.4954938910606108e-06, "loss": 0.0495, "reward": 0.0892857201397419, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 407.6785888671875, "epoch": 4.310077519379845, "grad_norm": 0.10979936271905899, "kl": 0.1221923828125, "learning_rate": 2.481018470667368e-06, "loss": 0.0797, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 408.23216247558594, "epoch": 4.325581395348837, "grad_norm": 1.0740007162094116, "kl": 1.7060546875, "learning_rate": 2.4665543362003802e-06, "loss": 0.057, "reward": 0.0857142936438322, "reward_std": 0.020203052321448922, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 389.9643020629883, "epoch": 4.341085271317829, "grad_norm": 0.8059707880020142, "kl": 1.093017578125, "learning_rate": 2.4521020945457615e-06, "loss": 0.0484, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 458.00001525878906, "epoch": 4.3565891472868215, "grad_norm": 3.3947207927703857, "kl": 0.936279296875, "learning_rate": 2.4376623520906255e-06, "loss": 0.1075, "reward": 0.08571428991854191, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 359.9821548461914, "epoch": 4.372093023255814, "grad_norm": 0.2123708724975586, "kl": 0.177001953125, "learning_rate": 2.4232357146976478e-06, "loss": 0.0428, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 373.87501525878906, "epoch": 4.387596899224806, "grad_norm": 1.0469727516174316, "kl": 2.60693359375, "learning_rate": 2.408822787679637e-06, "loss": 0.0565, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 300.5714416503906, "epoch": 4.403100775193798, "grad_norm": 0.21965286135673523, "kl": 0.1136474609375, "learning_rate": 2.3944241757741475e-06, "loss": 0.0211, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 497.08931732177734, "epoch": 4.4186046511627906, "grad_norm": 0.4067678451538086, "kl": 1.36474609375, "learning_rate": 2.380040483118097e-06, "loss": 0.1029, "reward": 0.08571429178118706, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 493.3928756713867, "epoch": 4.434108527131783, "grad_norm": 0.29808875918388367, "kl": 0.3673095703125, "learning_rate": 2.365672313222419e-06, "loss": 0.0918, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 578.4643249511719, "epoch": 4.449612403100775, "grad_norm": 0.47009024024009705, "kl": 0.503173828125, "learning_rate": 2.351320268946749e-06, "loss": 0.1227, "reward": 0.08214286155998707, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214285969734192, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 467.6071548461914, "epoch": 4.465116279069767, "grad_norm": 1.2561384439468384, "kl": 0.22412109375, "learning_rate": 2.336984952474119e-06, "loss": 0.067, "reward": 0.08928572200238705, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 370.3214416503906, "epoch": 4.48062015503876, "grad_norm": 0.14166420698165894, "kl": 0.16064453125, "learning_rate": 2.322666965285697e-06, "loss": 0.0425, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 761.8036117553711, "epoch": 4.496124031007752, "grad_norm": 0.33948689699172974, "kl": 0.207275390625, "learning_rate": 2.3083669081355507e-06, "loss": 0.1666, "reward": 0.07321429066359997, "reward_std": 0.027779196621850133, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7321428805589676, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 745.7678985595703, "epoch": 4.511627906976744, "grad_norm": 0.9462884068489075, "kl": 1.53857421875, "learning_rate": 2.2940853810254377e-06, "loss": 0.1113, "reward": 0.06964286044239998, "reward_std": 0.022728433134034276, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6964285969734192, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 822.5893249511719, "epoch": 4.5271317829457365, "grad_norm": 0.6085572838783264, "kl": 0.3189697265625, "learning_rate": 2.2798229831796313e-06, "loss": 0.2599, "reward": 0.07321429066359997, "reward_std": 0.037880722898989916, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7321428954601288, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 993.9286041259766, "epoch": 4.542635658914729, "grad_norm": 0.5242969393730164, "kl": 0.3759765625, "learning_rate": 2.2655803130197816e-06, "loss": 0.1781, "reward": 0.057142860256135464, "reward_std": 0.025253815110772848, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714286118745804, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 886.1607513427734, "epoch": 4.558139534883721, "grad_norm": 0.3978789150714874, "kl": 0.3583984375, "learning_rate": 2.2513579681398034e-06, "loss": 0.188, "reward": 0.06964286044239998, "reward_std": 0.027779195923358202, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6964286118745804, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 694.1607513427734, "epoch": 4.573643410852713, "grad_norm": 1.5433213710784912, "kl": 0.54296875, "learning_rate": 2.237156545280803e-06, "loss": 0.1312, "reward": 0.07857143133878708, "reward_std": 0.020203052321448922, "rewards/code_reward": 0.0, "rewards/format_reward": 0.785714328289032, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 935.6965026855469, "epoch": 4.589147286821706, "grad_norm": 0.7737842798233032, "kl": 0.349609375, "learning_rate": 2.2229766403060403e-06, "loss": 0.1306, "reward": 0.06607143394649029, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.660714328289032, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 854.2143249511719, "epoch": 4.604651162790698, "grad_norm": 0.2729988992214203, "kl": 0.80322265625, "learning_rate": 2.2088188481759305e-06, "loss": 0.1527, "reward": 0.06428571976721287, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6428571790456772, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 559.0178833007812, "epoch": 4.62015503875969, "grad_norm": 0.42360585927963257, "kl": 0.226318359375, "learning_rate": 2.194683762923073e-06, "loss": 0.1256, "reward": 0.08214286342263222, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 557.4107437133789, "epoch": 4.635658914728682, "grad_norm": 1.7277472019195557, "kl": 1.413330078125, "learning_rate": 2.1805719776273387e-06, "loss": 0.1408, "reward": 0.08035714738070965, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714626312256, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 587.2857513427734, "epoch": 4.651162790697675, "grad_norm": 0.5576246976852417, "kl": 0.68017578125, "learning_rate": 2.166484084390974e-06, "loss": 0.1176, "reward": 0.0803571492433548, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 607.6607666015625, "epoch": 4.666666666666667, "grad_norm": 0.4789409339427948, "kl": 0.2098388671875, "learning_rate": 2.1524206743137636e-06, "loss": 0.1325, "reward": 0.08392857946455479, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 360.62500762939453, "epoch": 4.682170542635659, "grad_norm": 0.17774701118469238, "kl": 0.1337890625, "learning_rate": 2.1383823374682287e-06, "loss": 0.0556, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 371.44644927978516, "epoch": 4.6976744186046515, "grad_norm": 29.56537437438965, "kl": 11.70361328125, "learning_rate": 2.124369662874868e-06, "loss": 0.1786, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 487.1785888671875, "epoch": 4.713178294573644, "grad_norm": 141.20318603515625, "kl": 45.4642333984375, "learning_rate": 2.110383238477441e-06, "loss": 0.524, "reward": 0.08571428991854191, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 510.6428756713867, "epoch": 4.728682170542635, "grad_norm": 0.6217575669288635, "kl": 1.52783203125, "learning_rate": 2.096423651118305e-06, "loss": 0.1348, "reward": 0.0857142936438322, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 591.4107360839844, "epoch": 4.7441860465116275, "grad_norm": 0.8732028007507324, "kl": 1.2354736328125, "learning_rate": 2.082491486513788e-06, "loss": 0.1306, "reward": 0.07678571902215481, "reward_std": 0.022728433134034276, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571790456772, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 469.5357437133789, "epoch": 4.75968992248062, "grad_norm": 0.8003130555152893, "kl": 1.0810546875, "learning_rate": 2.0685873292296116e-06, "loss": 0.1155, "reward": 0.08750000409781933, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 462.5000305175781, "epoch": 4.775193798449612, "grad_norm": 4.2348222732543945, "kl": 3.279296875, "learning_rate": 2.054711762656369e-06, "loss": 0.0886, "reward": 0.08571428991854191, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 433.16072845458984, "epoch": 4.790697674418604, "grad_norm": 0.15781785547733307, "kl": 0.1988525390625, "learning_rate": 2.040865368985044e-06, "loss": 0.0994, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 313.6250114440918, "epoch": 4.8062015503875966, "grad_norm": 0.2198803722858429, "kl": 0.2431640625, "learning_rate": 2.027048729182583e-06, "loss": 0.0431, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 382.94644927978516, "epoch": 4.821705426356589, "grad_norm": 0.25655123591423035, "kl": 0.217529296875, "learning_rate": 2.0132624229675205e-06, "loss": 0.0808, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 344.1071548461914, "epoch": 4.837209302325581, "grad_norm": 0.1982196867465973, "kl": 0.136962890625, "learning_rate": 1.9995070287856546e-06, "loss": 0.0639, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 407.67858123779297, "epoch": 4.852713178294573, "grad_norm": 0.3460334241390228, "kl": 0.26708984375, "learning_rate": 1.985783123785774e-06, "loss": 0.0776, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 366.7143020629883, "epoch": 4.868217054263566, "grad_norm": 0.31413528323173523, "kl": 0.273681640625, "learning_rate": 1.9720912837954486e-06, "loss": 0.0568, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 355.3750228881836, "epoch": 4.883720930232558, "grad_norm": 1.5121686458587646, "kl": 0.6146240234375, "learning_rate": 1.958432083296862e-06, "loss": 0.0288, "reward": 0.09285715036094189, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 442.3393096923828, "epoch": 4.89922480620155, "grad_norm": 0.1976187527179718, "kl": 0.233154296875, "learning_rate": 1.9448060954027093e-06, "loss": 0.1026, "reward": 0.08750000596046448, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 450.5357437133789, "epoch": 4.9147286821705425, "grad_norm": 0.8077256083488464, "kl": 0.3204345703125, "learning_rate": 1.931213891832153e-06, "loss": 0.1048, "reward": 0.0892857201397419, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 416.73216247558594, "epoch": 4.930232558139535, "grad_norm": 0.5839415192604065, "kl": 0.266357421875, "learning_rate": 1.9176560428868336e-06, "loss": 0.0852, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 423.80358505249023, "epoch": 4.945736434108527, "grad_norm": 0.23957200348377228, "kl": 0.19775390625, "learning_rate": 1.9041331174269373e-06, "loss": 0.0808, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 543.5000228881836, "epoch": 4.961240310077519, "grad_norm": 0.5168282985687256, "kl": 0.46875, "learning_rate": 1.8906456828473341e-06, "loss": 0.1104, "reward": 0.08571429178118706, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 430.4285888671875, "epoch": 4.976744186046512, "grad_norm": 0.32360389828681946, "kl": 0.426513671875, "learning_rate": 1.8771943050537656e-06, "loss": 0.0953, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 309.8125, "epoch": 4.992248062015504, "grad_norm": 0.24179396033287048, "kl": 0.2115478515625, "learning_rate": 1.8637795484391046e-06, "loss": 0.04, "reward": 0.09285715036094189, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714328289032, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 586.9464569091797, "epoch": 5.015503875968992, "grad_norm": 0.818027675151825, "kl": 0.38525390625, "learning_rate": 1.8504019758596698e-06, "loss": 0.1226, "reward": 0.08392857760190964, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857313156128, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 378.0357246398926, "epoch": 5.0310077519379846, "grad_norm": 0.2399987429380417, "kl": 0.15380859375, "learning_rate": 1.8370621486116163e-06, "loss": 0.0592, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 547.6607513427734, "epoch": 5.046511627906977, "grad_norm": 0.6719325184822083, "kl": 0.46630859375, "learning_rate": 1.823760626407377e-06, "loss": 0.1547, "reward": 0.08571429178118706, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 534.8928833007812, "epoch": 5.062015503875969, "grad_norm": 1.7380086183547974, "kl": 0.52978515625, "learning_rate": 1.8104979673521838e-06, "loss": 0.1056, "reward": 0.08571429178118706, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 476.76788330078125, "epoch": 5.077519379844961, "grad_norm": 0.3788207173347473, "kl": 0.2706298828125, "learning_rate": 1.7972747279206482e-06, "loss": 0.0984, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 497.48216247558594, "epoch": 5.093023255813954, "grad_norm": 0.45556584000587463, "kl": 1.263916015625, "learning_rate": 1.7840914629334122e-06, "loss": 0.1099, "reward": 0.08571428991854191, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 383.6607360839844, "epoch": 5.108527131782946, "grad_norm": 0.08392675966024399, "kl": 0.09619140625, "learning_rate": 1.7709487255338731e-06, "loss": 0.0387, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 438.5357360839844, "epoch": 5.124031007751938, "grad_norm": 0.34529322385787964, "kl": 0.2061767578125, "learning_rate": 1.7578470671649684e-06, "loss": 0.0743, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 467.55359649658203, "epoch": 5.1395348837209305, "grad_norm": 0.25744950771331787, "kl": 0.1669921875, "learning_rate": 1.744787037546045e-06, "loss": 0.0932, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 478.5893096923828, "epoch": 5.155038759689923, "grad_norm": 0.19755351543426514, "kl": 0.215087890625, "learning_rate": 1.731769184649788e-06, "loss": 0.0735, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 520.6428833007812, "epoch": 5.170542635658915, "grad_norm": 0.23117099702358246, "kl": 0.2330322265625, "learning_rate": 1.7187940546792325e-06, "loss": 0.1332, "reward": 0.08750000596046448, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 416.58929443359375, "epoch": 5.186046511627907, "grad_norm": 0.45335835218429565, "kl": 0.361083984375, "learning_rate": 1.7058621920448465e-06, "loss": 0.0598, "reward": 0.09107143431901932, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 446.3035888671875, "epoch": 5.2015503875969, "grad_norm": 0.2482314556837082, "kl": 0.2408447265625, "learning_rate": 1.6929741393416855e-06, "loss": 0.0759, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 463.66073989868164, "epoch": 5.217054263565892, "grad_norm": 0.6526241302490234, "kl": 0.2216796875, "learning_rate": 1.6801304373266286e-06, "loss": 0.0948, "reward": 0.09107143618166447, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 579.6607437133789, "epoch": 5.232558139534884, "grad_norm": 0.7119612693786621, "kl": 0.199951171875, "learning_rate": 1.667331624895689e-06, "loss": 0.1398, "reward": 0.08571429178118706, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 421.7857437133789, "epoch": 5.248062015503876, "grad_norm": 0.12730364501476288, "kl": 0.1505126953125, "learning_rate": 1.6545782390614037e-06, "loss": 0.0542, "reward": 0.09464286640286446, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 558.1428756713867, "epoch": 5.263565891472869, "grad_norm": 0.6476343274116516, "kl": 1.1666259765625, "learning_rate": 1.6418708149302992e-06, "loss": 0.1135, "reward": 0.08392857760190964, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 563.1250305175781, "epoch": 5.27906976744186, "grad_norm": 0.5077939033508301, "kl": 1.557861328125, "learning_rate": 1.6292098856804423e-06, "loss": 0.1212, "reward": 0.08392857760190964, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 542.3036041259766, "epoch": 5.294573643410852, "grad_norm": 0.2610304057598114, "kl": 0.2861328125, "learning_rate": 1.6165959825390661e-06, "loss": 0.0738, "reward": 0.08571429178118706, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 726.9286041259766, "epoch": 5.310077519379845, "grad_norm": 0.8110432624816895, "kl": 0.67041015625, "learning_rate": 1.604029634760284e-06, "loss": 0.2361, "reward": 0.07500000298023224, "reward_std": 0.03030457766726613, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7500000298023224, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 639.2143249511719, "epoch": 5.325581395348837, "grad_norm": 0.5734603404998779, "kl": 0.221923828125, "learning_rate": 1.59151136960288e-06, "loss": 0.1314, "reward": 0.07857143133878708, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7857143133878708, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 612.1607437133789, "epoch": 5.341085271317829, "grad_norm": 0.37599024176597595, "kl": 0.360595703125, "learning_rate": 1.5790417123081903e-06, "loss": 0.1625, "reward": 0.08214286342263222, "reward_std": 0.025253814877942204, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 552.0357360839844, "epoch": 5.3565891472868215, "grad_norm": 0.21845099329948425, "kl": 0.2716064453125, "learning_rate": 1.5666211860780583e-06, "loss": 0.1555, "reward": 0.08571429550647736, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 583.9464416503906, "epoch": 5.372093023255814, "grad_norm": 0.6591483950614929, "kl": 0.310791015625, "learning_rate": 1.5542503120528918e-06, "loss": 0.143, "reward": 0.08571428991854191, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 686.3214569091797, "epoch": 5.387596899224806, "grad_norm": 0.49289828538894653, "kl": 0.2916259765625, "learning_rate": 1.5419296092897866e-06, "loss": 0.1556, "reward": 0.07678572088479996, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571939468384, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 575.5357437133789, "epoch": 5.403100775193798, "grad_norm": 0.5644925832748413, "kl": 1.2939453125, "learning_rate": 1.529659594740755e-06, "loss": 0.1451, "reward": 0.08392857760190964, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857313156128, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 665.3571853637695, "epoch": 5.4186046511627906, "grad_norm": 0.4430839717388153, "kl": 0.51171875, "learning_rate": 1.5174407832310338e-06, "loss": 0.1747, "reward": 0.0803571492433548, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 569.732177734375, "epoch": 5.434108527131783, "grad_norm": 0.48753219842910767, "kl": 0.559326171875, "learning_rate": 1.5052736874374815e-06, "loss": 0.1818, "reward": 0.08214286342263222, "reward_std": 0.025253814877942204, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 604.0714492797852, "epoch": 5.449612403100775, "grad_norm": 1.0126312971115112, "kl": 0.509765625, "learning_rate": 1.4931588178670695e-06, "loss": 0.1626, "reward": 0.08214286155998707, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 641.9643096923828, "epoch": 5.465116279069767, "grad_norm": 0.4418766498565674, "kl": 0.4609375, "learning_rate": 1.4810966828354605e-06, "loss": 0.2133, "reward": 0.08035714738070965, "reward_std": 0.027779195923358202, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 672.5178833007812, "epoch": 5.48062015503876, "grad_norm": 0.30397123098373413, "kl": 0.34521484375, "learning_rate": 1.469087788445684e-06, "loss": 0.1997, "reward": 0.0803571492433548, "reward_std": 0.02777919638901949, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 607.7678985595703, "epoch": 5.496124031007752, "grad_norm": 0.5019213557243347, "kl": 0.195068359375, "learning_rate": 1.4571326385668965e-06, "loss": 0.1518, "reward": 0.08392857946455479, "reward_std": 0.022728433832526207, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 552.5000228881836, "epoch": 5.511627906976744, "grad_norm": 1.0830461978912354, "kl": 2.527099609375, "learning_rate": 1.4452317348132434e-06, "loss": 0.1191, "reward": 0.08214286155998707, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 604.357177734375, "epoch": 5.5271317829457365, "grad_norm": 1.086542010307312, "kl": 0.6982421875, "learning_rate": 1.4333855765228104e-06, "loss": 0.1066, "reward": 0.0803571492433548, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 553.1428756713867, "epoch": 5.542635658914729, "grad_norm": 0.7319161891937256, "kl": 0.232177734375, "learning_rate": 1.421594660736675e-06, "loss": 0.1141, "reward": 0.08571429178118706, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 666.678596496582, "epoch": 5.558139534883721, "grad_norm": 0.21619383990764618, "kl": 0.128662109375, "learning_rate": 1.4098594821780476e-06, "loss": 0.1202, "reward": 0.08035714738070965, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 540.1428680419922, "epoch": 5.573643410852713, "grad_norm": 0.1556929498910904, "kl": 0.206787109375, "learning_rate": 1.3981805332315174e-06, "loss": 0.1111, "reward": 0.08571429178118706, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 496.2857322692871, "epoch": 5.589147286821706, "grad_norm": 0.15184734761714935, "kl": 0.169921875, "learning_rate": 1.3865583039223929e-06, "loss": 0.0747, "reward": 0.08928572200238705, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 474.26788330078125, "epoch": 5.604651162790698, "grad_norm": 0.7148901224136353, "kl": 1.36328125, "learning_rate": 1.374993281896137e-06, "loss": 0.0816, "reward": 0.0892857238650322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 391.1607360839844, "epoch": 5.62015503875969, "grad_norm": 0.2310057431459427, "kl": 0.188720703125, "learning_rate": 1.3634859523979134e-06, "loss": 0.0509, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 537.8393020629883, "epoch": 5.635658914728682, "grad_norm": 0.5635959506034851, "kl": 2.5693359375, "learning_rate": 1.3520367982522208e-06, "loss": 0.0846, "reward": 0.08214286342263222, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286267757416, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 483.4643096923828, "epoch": 5.651162790697675, "grad_norm": 0.26445066928863525, "kl": 0.192626953125, "learning_rate": 1.3406462998426358e-06, "loss": 0.0337, "reward": 0.0892857201397419, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 530.2143173217773, "epoch": 5.666666666666667, "grad_norm": 0.4850466847419739, "kl": 0.1839599609375, "learning_rate": 1.3293149350916595e-06, "loss": 0.1534, "reward": 0.0857142936438322, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 407.9464569091797, "epoch": 5.682170542635659, "grad_norm": 0.11648667603731155, "kl": 0.1539306640625, "learning_rate": 1.3180431794406623e-06, "loss": 0.0207, "reward": 0.09464286454021931, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 437.0714416503906, "epoch": 5.6976744186046515, "grad_norm": 0.2448117733001709, "kl": 0.2156982421875, "learning_rate": 1.3068315058299358e-06, "loss": 0.0762, "reward": 0.09285715036094189, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714477300644, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 483.6607360839844, "epoch": 5.713178294573644, "grad_norm": 0.6701087951660156, "kl": 0.1866455078125, "learning_rate": 1.2956803846788503e-06, "loss": 0.0674, "reward": 0.08750000782310963, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 504.6250305175781, "epoch": 5.728682170542635, "grad_norm": 0.8088942766189575, "kl": 0.3497314453125, "learning_rate": 1.284590283866116e-06, "loss": 0.1427, "reward": 0.08750000409781933, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 677.2321624755859, "epoch": 5.7441860465116275, "grad_norm": 0.19237665832042694, "kl": 0.2906494140625, "learning_rate": 1.2735616687101518e-06, "loss": 0.1596, "reward": 0.07678571715950966, "reward_std": 0.022728433134034276, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571790456772, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 480.80359649658203, "epoch": 5.75968992248062, "grad_norm": 1.9329370260238647, "kl": 1.5391845703125, "learning_rate": 1.2625950019495614e-06, "loss": 0.1103, "reward": 0.08750000409781933, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 662.2143096923828, "epoch": 5.775193798449612, "grad_norm": 0.368585467338562, "kl": 0.1148681640625, "learning_rate": 1.251690743723718e-06, "loss": 0.1846, "reward": 0.0803571492433548, "reward_std": 0.027779196621850133, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714626312256, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 546.0000152587891, "epoch": 5.790697674418604, "grad_norm": 0.2216712236404419, "kl": 0.20556640625, "learning_rate": 1.2408493515534581e-06, "loss": 0.0727, "reward": 0.08571429178118706, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 663.9643173217773, "epoch": 5.8062015503875966, "grad_norm": 0.8497912883758545, "kl": 0.2208251953125, "learning_rate": 1.2300712803218834e-06, "loss": 0.182, "reward": 0.07500000484287739, "reward_std": 0.03030457766726613, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7500000447034836, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 643.928596496582, "epoch": 5.821705426356589, "grad_norm": 0.9936148524284363, "kl": 1.080078125, "learning_rate": 1.2193569822552772e-06, "loss": 0.1875, "reward": 0.07857143506407738, "reward_std": 0.02525381464511156, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7857143431901932, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 744.1250610351562, "epoch": 5.837209302325581, "grad_norm": 0.7761398553848267, "kl": 1.13818359375, "learning_rate": 1.2087069069041268e-06, "loss": 0.1931, "reward": 0.07321429066359997, "reward_std": 0.02777919638901949, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7321428954601288, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 729.5357360839844, "epoch": 5.852713178294573, "grad_norm": 0.48130375146865845, "kl": 0.517333984375, "learning_rate": 1.1981215011242654e-06, "loss": 0.1987, "reward": 0.07500000670552254, "reward_std": 0.025253815343603492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7500000447034836, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 743.2857513427734, "epoch": 5.868217054263566, "grad_norm": 0.4081970155239105, "kl": 1.74365234375, "learning_rate": 1.1876012090581184e-06, "loss": 0.2046, "reward": 0.0714285746216774, "reward_std": 0.030304577900096774, "rewards/code_reward": 0.0, "rewards/format_reward": 0.714285746216774, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 889.1964721679688, "epoch": 5.883720930232558, "grad_norm": 0.2974870502948761, "kl": 0.357666015625, "learning_rate": 1.177146472116071e-06, "loss": 0.1796, "reward": 0.06607143208384514, "reward_std": 0.022728433832526207, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6607143133878708, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 800.3214721679688, "epoch": 5.89922480620155, "grad_norm": 1.8590128421783447, "kl": 2.197265625, "learning_rate": 1.1667577289579462e-06, "loss": 0.2563, "reward": 0.06785714626312256, "reward_std": 0.0353553406894207, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6785714775323868, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 828.8214721679688, "epoch": 5.9147286821705425, "grad_norm": 1.9380897283554077, "kl": 1.0203857421875, "learning_rate": 1.1564354154746007e-06, "loss": 0.183, "reward": 0.06964286044239998, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.6964285969734192, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 502.98217391967773, "epoch": 5.930232558139535, "grad_norm": 0.8601269125938416, "kl": 2.1402587890625, "learning_rate": 1.146179964769635e-06, "loss": 0.0414, "reward": 0.08392857760190964, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.839285746216774, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 557.3393096923828, "epoch": 5.945736434108527, "grad_norm": 0.22515276074409485, "kl": 0.217041015625, "learning_rate": 1.1359918071412195e-06, "loss": 0.097, "reward": 0.08392857573926449, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 464.10717010498047, "epoch": 5.961240310077519, "grad_norm": 1.7024885416030884, "kl": 1.64404296875, "learning_rate": 1.1258713700640456e-06, "loss": 0.0753, "reward": 0.08750000596046448, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 425.8035888671875, "epoch": 5.976744186046512, "grad_norm": 0.17091427743434906, "kl": 0.12548828125, "learning_rate": 1.115819078171383e-06, "loss": 0.0727, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 580.96875, "epoch": 5.992248062015504, "grad_norm": 0.2202906310558319, "kl": 0.183349609375, "learning_rate": 1.1058353532372667e-06, "loss": 0.1451, "reward": 0.08571429178118706, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 501.57144927978516, "epoch": 6.015503875968992, "grad_norm": 0.42733341455459595, "kl": 0.106689453125, "learning_rate": 1.0959206141587998e-06, "loss": 0.1143, "reward": 0.08928571827709675, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 477.94644927978516, "epoch": 6.0310077519379846, "grad_norm": 0.15901905298233032, "kl": 0.1177978515625, "learning_rate": 1.0860752769385766e-06, "loss": 0.0928, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 368.4285888671875, "epoch": 6.046511627906977, "grad_norm": 0.8356048464775085, "kl": 0.5009765625, "learning_rate": 1.0762997546672279e-06, "loss": 0.0306, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 537.2321701049805, "epoch": 6.062015503875969, "grad_norm": 0.337862104177475, "kl": 0.1614990234375, "learning_rate": 1.0665944575060914e-06, "loss": 0.1243, "reward": 0.08750000782310963, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 498.9821548461914, "epoch": 6.077519379844961, "grad_norm": 1.3160614967346191, "kl": 1.275390625, "learning_rate": 1.056959792669997e-06, "loss": 0.1232, "reward": 0.08392857760190964, "reward_std": 0.022728433134034276, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 455.35716247558594, "epoch": 6.093023255813954, "grad_norm": 0.6817833185195923, "kl": 0.56591796875, "learning_rate": 1.0473961644101856e-06, "loss": 0.0979, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 460.05359649658203, "epoch": 6.108527131782946, "grad_norm": 0.9967363476753235, "kl": 0.3055419921875, "learning_rate": 1.037903973997345e-06, "loss": 0.0846, "reward": 0.08750000782310963, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 567.803596496582, "epoch": 6.124031007751938, "grad_norm": 0.22702349722385406, "kl": 0.1322021484375, "learning_rate": 1.0284836197047737e-06, "loss": 0.1403, "reward": 0.0857142936438322, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 586.8393020629883, "epoch": 6.1395348837209305, "grad_norm": 0.20280596613883972, "kl": 0.182373046875, "learning_rate": 1.0191354967916712e-06, "loss": 0.0784, "reward": 0.08214286528527737, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286267757416, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 549.9107360839844, "epoch": 6.155038759689923, "grad_norm": 0.27999112010002136, "kl": 0.1822509765625, "learning_rate": 1.0098599974865515e-06, "loss": 0.1597, "reward": 0.0857142936438322, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 518.0893020629883, "epoch": 6.170542635658915, "grad_norm": 0.2629588842391968, "kl": 0.184814453125, "learning_rate": 1.0006575109707898e-06, "loss": 0.0787, "reward": 0.0892857201397419, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 547.4107360839844, "epoch": 6.186046511627907, "grad_norm": 0.5427741408348083, "kl": 0.40625, "learning_rate": 9.915284233622877e-07, "loss": 0.1505, "reward": 0.08392857760190964, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 537.7500228881836, "epoch": 6.2015503875969, "grad_norm": 0.4367233216762543, "kl": 0.438232421875, "learning_rate": 9.824731176992796e-07, "loss": 0.1517, "reward": 0.08571429178118706, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 617.9821701049805, "epoch": 6.217054263565892, "grad_norm": 0.336049884557724, "kl": 0.306640625, "learning_rate": 9.734919739242543e-07, "loss": 0.1765, "reward": 0.0803571492433548, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 599.1964569091797, "epoch": 6.232558139534884, "grad_norm": 0.5920295119285583, "kl": 0.416259765625, "learning_rate": 9.645853688680177e-07, "loss": 0.1458, "reward": 0.08214286528527737, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286267757416, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 540.6428833007812, "epoch": 6.248062015503876, "grad_norm": 0.3487648665904999, "kl": 0.394775390625, "learning_rate": 9.557536762338786e-07, "loss": 0.112, "reward": 0.08750000596046448, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 528.6071701049805, "epoch": 6.263565891472869, "grad_norm": 0.507084846496582, "kl": 0.4010009765625, "learning_rate": 9.46997266581973e-07, "loss": 0.1125, "reward": 0.08571429178118706, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 561.0000305175781, "epoch": 6.27906976744186, "grad_norm": 0.4561972916126251, "kl": 0.3779296875, "learning_rate": 9.383165073137115e-07, "loss": 0.0993, "reward": 0.08392857946455479, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 429.57144927978516, "epoch": 6.294573643410852, "grad_norm": 0.48230594396591187, "kl": 0.860107421875, "learning_rate": 9.297117626563687e-07, "loss": 0.0911, "reward": 0.08928572200238705, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 590.428581237793, "epoch": 6.310077519379845, "grad_norm": 0.8593739867210388, "kl": 2.3194580078125, "learning_rate": 9.211833936477957e-07, "loss": 0.1887, "reward": 0.07857143506407738, "reward_std": 0.03030457836575806, "rewards/code_reward": 0.0, "rewards/format_reward": 0.785714328289032, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 448.3393096923828, "epoch": 6.325581395348837, "grad_norm": 0.32057252526283264, "kl": 0.25048828125, "learning_rate": 9.127317581212753e-07, "loss": 0.0977, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 484.67859268188477, "epoch": 6.341085271317829, "grad_norm": 0.534175455570221, "kl": 0.37890625, "learning_rate": 9.043572106905084e-07, "loss": 0.1328, "reward": 0.08750000782310963, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 470.5535888671875, "epoch": 6.3565891472868215, "grad_norm": 0.475492924451828, "kl": 1.1109619140625, "learning_rate": 8.960601027347321e-07, "loss": 0.1104, "reward": 0.08750000409781933, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 570.3214416503906, "epoch": 6.372093023255814, "grad_norm": 0.6804907917976379, "kl": 0.504150390625, "learning_rate": 8.878407823839788e-07, "loss": 0.1259, "reward": 0.08392857946455479, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 520.8928833007812, "epoch": 6.387596899224806, "grad_norm": 0.31301963329315186, "kl": 0.2147216796875, "learning_rate": 8.796995945044689e-07, "loss": 0.0639, "reward": 0.08750000782310963, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 400.62500762939453, "epoch": 6.403100775193798, "grad_norm": 0.1473621129989624, "kl": 0.173095703125, "learning_rate": 8.716368806841405e-07, "loss": 0.0833, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 459.0714569091797, "epoch": 6.4186046511627906, "grad_norm": 0.17894765734672546, "kl": 0.1246337890625, "learning_rate": 8.636529792183171e-07, "loss": 0.074, "reward": 0.0892857201397419, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 485.55359649658203, "epoch": 6.434108527131783, "grad_norm": 0.17360633611679077, "kl": 0.1624755859375, "learning_rate": 8.557482250955144e-07, "loss": 0.1076, "reward": 0.0892857201397419, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 536.5893173217773, "epoch": 6.449612403100775, "grad_norm": 1.2795969247817993, "kl": 0.708984375, "learning_rate": 8.479229499833844e-07, "loss": 0.1615, "reward": 0.0857142936438322, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 508.01788330078125, "epoch": 6.465116279069767, "grad_norm": 0.28875473141670227, "kl": 0.2587890625, "learning_rate": 8.401774822147976e-07, "loss": 0.1362, "reward": 0.08750000782310963, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 560.0178909301758, "epoch": 6.48062015503876, "grad_norm": 0.5606586337089539, "kl": 0.267333984375, "learning_rate": 8.325121467740695e-07, "loss": 0.1107, "reward": 0.08571428991854191, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 530.4643096923828, "epoch": 6.496124031007752, "grad_norm": 0.5019002556800842, "kl": 1.6973876953125, "learning_rate": 8.249272652833226e-07, "loss": 0.0951, "reward": 0.08571429550647736, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 549.6250228881836, "epoch": 6.511627906976744, "grad_norm": 0.23863928020000458, "kl": 0.21630859375, "learning_rate": 8.174231559889931e-07, "loss": 0.1364, "reward": 0.08392857387661934, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.839285746216774, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 572.4285888671875, "epoch": 6.5271317829457365, "grad_norm": 0.3975156247615814, "kl": 1.265869140625, "learning_rate": 8.100001337484787e-07, "loss": 0.1362, "reward": 0.08214285969734192, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 455.23216247558594, "epoch": 6.542635658914729, "grad_norm": 0.2875515818595886, "kl": 0.726806640625, "learning_rate": 8.026585100169251e-07, "loss": 0.0998, "reward": 0.09107143431901932, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 564.5714492797852, "epoch": 6.558139534883721, "grad_norm": 0.4496704339981079, "kl": 0.2958984375, "learning_rate": 7.953985928341601e-07, "loss": 0.1502, "reward": 0.0857142936438322, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 558.9643096923828, "epoch": 6.573643410852713, "grad_norm": 0.5100883841514587, "kl": 1.135009765625, "learning_rate": 7.882206868117693e-07, "loss": 0.0882, "reward": 0.08214286342263222, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 636.2143096923828, "epoch": 6.589147286821706, "grad_norm": 0.7258642315864563, "kl": 1.04638671875, "learning_rate": 7.81125093120313e-07, "loss": 0.1719, "reward": 0.07857143506407738, "reward_std": 0.025253815343603492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7857143431901932, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 622.4464492797852, "epoch": 6.604651162790698, "grad_norm": 0.47343626618385315, "kl": 0.2442626953125, "learning_rate": 7.741121094766916e-07, "loss": 0.166, "reward": 0.08035714738070965, "reward_std": 0.022728433134034276, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714775323868, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 628.3928833007812, "epoch": 6.62015503875969, "grad_norm": 0.1919441670179367, "kl": 0.190673828125, "learning_rate": 7.671820301316532e-07, "loss": 0.1353, "reward": 0.08035714738070965, "reward_std": 0.01767767034471035, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714626312256, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 513.9107246398926, "epoch": 6.635658914728682, "grad_norm": 0.5142886638641357, "kl": 1.94091796875, "learning_rate": 7.603351458574474e-07, "loss": 0.1437, "reward": 0.08035714738070965, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714477300644, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 474.89288330078125, "epoch": 6.651162790697675, "grad_norm": 0.16202746331691742, "kl": 0.130859375, "learning_rate": 7.535717439356255e-07, "loss": 0.0393, "reward": 0.0892857201397419, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 627.6785888671875, "epoch": 6.666666666666667, "grad_norm": 0.4151498079299927, "kl": 0.4073486328125, "learning_rate": 7.46892108144986e-07, "loss": 0.1021, "reward": 0.08035714738070965, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8035714626312256, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 461.4107322692871, "epoch": 6.682170542635659, "grad_norm": 0.9489464163780212, "kl": 2.072998046875, "learning_rate": 7.402965187496697e-07, "loss": 0.0601, "reward": 0.08750000968575478, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000596046448, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 580.928596496582, "epoch": 6.6976744186046515, "grad_norm": 0.6491573452949524, "kl": 0.584716796875, "learning_rate": 7.337852524873974e-07, "loss": 0.0931, "reward": 0.08392857760190964, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.839285746216774, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 533.8035888671875, "epoch": 6.713178294573644, "grad_norm": 0.38969844579696655, "kl": 0.188232421875, "learning_rate": 7.273585825578608e-07, "loss": 0.1114, "reward": 0.0857142936438322, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 692.3036041259766, "epoch": 6.728682170542635, "grad_norm": 0.8381134867668152, "kl": 0.2666015625, "learning_rate": 7.21016778611259e-07, "loss": 0.2135, "reward": 0.07678571902215481, "reward_std": 0.027779196621850133, "rewards/code_reward": 0.0, "rewards/format_reward": 0.767857164144516, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 510.71431732177734, "epoch": 6.7441860465116275, "grad_norm": 0.562402069568634, "kl": 0.271484375, "learning_rate": 7.147601067369835e-07, "loss": 0.0545, "reward": 0.08571429178118706, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 550.9821624755859, "epoch": 6.75968992248062, "grad_norm": 0.7675048112869263, "kl": 0.66845703125, "learning_rate": 7.085888294524561e-07, "loss": 0.122, "reward": 0.0857142936438322, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 569.0000305175781, "epoch": 6.775193798449612, "grad_norm": 0.7039144039154053, "kl": 0.5224609375, "learning_rate": 7.025032056921117e-07, "loss": 0.1336, "reward": 0.08392857573926449, "reward_std": 0.022728432901203632, "rewards/code_reward": 0.0, "rewards/format_reward": 0.839285746216774, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 644.6428833007812, "epoch": 6.790697674418604, "grad_norm": 0.642671525478363, "kl": 0.45849609375, "learning_rate": 6.965034907965349e-07, "loss": 0.2222, "reward": 0.07678571902215481, "reward_std": 0.03282995941117406, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571939468384, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 651.3928756713867, "epoch": 6.8062015503875966, "grad_norm": 0.9575018286705017, "kl": 1.3515625, "learning_rate": 6.905899365017462e-07, "loss": 0.1341, "reward": 0.07678571902215481, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571790456772, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 517.9285888671875, "epoch": 6.821705426356589, "grad_norm": 1.0209612846374512, "kl": 0.86376953125, "learning_rate": 6.847627909286409e-07, "loss": 0.1229, "reward": 0.0857142936438322, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 689.7500457763672, "epoch": 6.837209302325581, "grad_norm": 0.41212135553359985, "kl": 0.3935546875, "learning_rate": 6.790222985725761e-07, "loss": 0.2152, "reward": 0.07678571902215481, "reward_std": 0.02777919638901949, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7678571790456772, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 658.1607437133789, "epoch": 6.852713178294573, "grad_norm": 0.4613673686981201, "kl": 0.466552734375, "learning_rate": 6.733687002931141e-07, "loss": 0.1572, "reward": 0.07857143320143223, "reward_std": 0.02020305162295699, "rewards/code_reward": 0.0, "rewards/format_reward": 0.785714328289032, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 762.0893096923828, "epoch": 6.868217054263566, "grad_norm": 0.6260417103767395, "kl": 0.501708984375, "learning_rate": 6.678022333039158e-07, "loss": 0.2349, "reward": 0.0714285746216774, "reward_std": 0.0353553406894207, "rewards/code_reward": 0.0, "rewards/format_reward": 0.714285746216774, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 552.803596496582, "epoch": 6.883720930232558, "grad_norm": 0.4210367798805237, "kl": 0.265625, "learning_rate": 6.623231311627876e-07, "loss": 0.1448, "reward": 0.08392857573926449, "reward_std": 0.017677670111879706, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 562.7500228881836, "epoch": 6.89922480620155, "grad_norm": 0.2927769124507904, "kl": 0.3931884765625, "learning_rate": 6.569316237618811e-07, "loss": 0.1344, "reward": 0.08392857760190964, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 424.03573989868164, "epoch": 6.9147286821705425, "grad_norm": 0.2194661796092987, "kl": 0.30908203125, "learning_rate": 6.516279373180499e-07, "loss": 0.0665, "reward": 0.09107143618166447, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107142984867096, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 527.428596496582, "epoch": 6.930232558139535, "grad_norm": 0.3145492672920227, "kl": 0.4027099609375, "learning_rate": 6.464122943633543e-07, "loss": 0.1228, "reward": 0.0857142936438322, "reward_std": 0.02020305208861828, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 350.0178756713867, "epoch": 6.945736434108527, "grad_norm": 0.3451617658138275, "kl": 0.1922607421875, "learning_rate": 6.412849137357271e-07, "loss": 0.0394, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 512.9821701049805, "epoch": 6.961240310077519, "grad_norm": 0.6971478462219238, "kl": 0.8720703125, "learning_rate": 6.3624601056979e-07, "loss": 0.0821, "reward": 0.08571429178118706, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 373.4464416503906, "epoch": 6.976744186046512, "grad_norm": 0.2429589033126831, "kl": 0.1673583984375, "learning_rate": 6.312957962878278e-07, "loss": 0.0339, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 419.25, "epoch": 6.992248062015504, "grad_norm": 0.7322734594345093, "kl": 0.724365234375, "learning_rate": 6.264344785909181e-07, "loss": 0.0543, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 322.08929443359375, "epoch": 7.015503875968992, "grad_norm": 0.11766334623098373, "kl": 0.0931396484375, "learning_rate": 6.216622614502149e-07, "loss": 0.0416, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 468.3035888671875, "epoch": 7.0310077519379846, "grad_norm": 0.3541572093963623, "kl": 0.212646484375, "learning_rate": 6.169793450983916e-07, "loss": 0.0952, "reward": 0.09107143431901932, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 455.50001525878906, "epoch": 7.046511627906977, "grad_norm": 0.1937684863805771, "kl": 0.177001953125, "learning_rate": 6.123859260212393e-07, "loss": 0.1133, "reward": 0.08928572200238705, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 467.4107360839844, "epoch": 7.062015503875969, "grad_norm": 0.21939167380332947, "kl": 0.1229248046875, "learning_rate": 6.07882196949423e-07, "loss": 0.0914, "reward": 0.09107143431901932, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107142984867096, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 455.4821548461914, "epoch": 7.077519379844961, "grad_norm": 0.30737853050231934, "kl": 0.302490234375, "learning_rate": 6.034683468503948e-07, "loss": 0.0882, "reward": 0.08928572200238705, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 368.7321662902832, "epoch": 7.093023255813954, "grad_norm": 1.252065896987915, "kl": 0.3232421875, "learning_rate": 5.991445609204641e-07, "loss": 0.0627, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 405.5893020629883, "epoch": 7.108527131782946, "grad_norm": 0.13495229184627533, "kl": 0.1036376953125, "learning_rate": 5.949110205770292e-07, "loss": 0.0504, "reward": 0.09464286267757416, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 346.8928680419922, "epoch": 7.124031007751938, "grad_norm": 0.15541866421699524, "kl": 0.1348876953125, "learning_rate": 5.90767903450964e-07, "loss": 0.0409, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 438.0000228881836, "epoch": 7.1395348837209305, "grad_norm": 0.16461335122585297, "kl": 0.1226806640625, "learning_rate": 5.867153833791652e-07, "loss": 0.0759, "reward": 0.09285714849829674, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 328.66072845458984, "epoch": 7.155038759689923, "grad_norm": 0.08290436118841171, "kl": 0.1099853515625, "learning_rate": 5.827536303972587e-07, "loss": 0.0361, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 416.75001525878906, "epoch": 7.170542635658915, "grad_norm": 0.4567578434944153, "kl": 1.5322265625, "learning_rate": 5.78882810732465e-07, "loss": 0.0584, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 314.1607246398926, "epoch": 7.186046511627907, "grad_norm": 0.949168860912323, "kl": 0.59814453125, "learning_rate": 5.75103086796625e-07, "loss": 0.0286, "reward": 0.09642858058214188, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 327.0893020629883, "epoch": 7.2015503875969, "grad_norm": 0.5412436127662659, "kl": 0.7969970703125, "learning_rate": 5.714146171793846e-07, "loss": 0.0063, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 412.1785888671875, "epoch": 7.217054263565892, "grad_norm": 0.24841195344924927, "kl": 0.1400146484375, "learning_rate": 5.678175566415422e-07, "loss": 0.0389, "reward": 0.09285714849829674, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 481.60717010498047, "epoch": 7.232558139534884, "grad_norm": 1.5795103311538696, "kl": 0.748291015625, "learning_rate": 5.643120561085528e-07, "loss": 0.0451, "reward": 0.08750000596046448, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 483.3393020629883, "epoch": 7.248062015503876, "grad_norm": 0.3128277361392975, "kl": 0.57806396484375, "learning_rate": 5.608982626641991e-07, "loss": 0.0626, "reward": 0.08750000782310963, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000298023224, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 388.3750228881836, "epoch": 7.263565891472869, "grad_norm": 0.8693004250526428, "kl": 1.3035888671875, "learning_rate": 5.575763195444166e-07, "loss": 0.0332, "reward": 0.09285715222358704, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 541.7500305175781, "epoch": 7.27906976744186, "grad_norm": 0.2898370027542114, "kl": 0.269287109375, "learning_rate": 5.543463661312847e-07, "loss": 0.1174, "reward": 0.0857142936438322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 366.8393020629883, "epoch": 7.294573643410852, "grad_norm": 0.2669702172279358, "kl": 0.2113037109375, "learning_rate": 5.512085379471808e-07, "loss": 0.0394, "reward": 0.09642857871949673, "reward_std": 0.00505076302215457, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9642857313156128, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 462.58931732177734, "epoch": 7.310077519379845, "grad_norm": 1.6552106142044067, "kl": 0.7685546875, "learning_rate": 5.481629666490903e-07, "loss": 0.1032, "reward": 0.08928572200238705, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 540.9643020629883, "epoch": 7.325581395348837, "grad_norm": 0.23821362853050232, "kl": 0.270263671875, "learning_rate": 5.452097800230853e-07, "loss": 0.1599, "reward": 0.08571429178118706, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 437.1071548461914, "epoch": 7.341085271317829, "grad_norm": 0.28896719217300415, "kl": 1.3438720703125, "learning_rate": 5.423491019789623e-07, "loss": 0.0397, "reward": 0.08928572200238705, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 386.44644927978516, "epoch": 7.3565891472868215, "grad_norm": 0.4218304455280304, "kl": 0.3291015625, "learning_rate": 5.395810525450425e-07, "loss": 0.0623, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 533.1428909301758, "epoch": 7.372093023255814, "grad_norm": 0.2983189821243286, "kl": 0.1497802734375, "learning_rate": 5.369057478631359e-07, "loss": 0.153, "reward": 0.0857142936438322, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 502.2678756713867, "epoch": 7.387596899224806, "grad_norm": 0.3067476451396942, "kl": 0.297119140625, "learning_rate": 5.343233001836694e-07, "loss": 0.0749, "reward": 0.0892857201397419, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 615.8928756713867, "epoch": 7.403100775193798, "grad_norm": 0.3363507390022278, "kl": 0.287353515625, "learning_rate": 5.318338178609754e-07, "loss": 0.1077, "reward": 0.08214286342263222, "reward_std": 0.015152288833633065, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8214286118745804, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 595.1785888671875, "epoch": 7.4186046511627906, "grad_norm": 0.17296263575553894, "kl": 0.2437744140625, "learning_rate": 5.294374053487459e-07, "loss": 0.0975, "reward": 0.08392857760190964, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857313156128, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 464.4107360839844, "epoch": 7.434108527131783, "grad_norm": 1.2098743915557861, "kl": 0.6441650390625, "learning_rate": 5.271341631956511e-07, "loss": 0.0544, "reward": 0.08928572200238705, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.892857164144516, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 355.25001525878906, "epoch": 7.449612403100775, "grad_norm": 0.0816921591758728, "kl": 0.100341796875, "learning_rate": 5.249241880411181e-07, "loss": 0.0204, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 451.66072845458984, "epoch": 7.465116279069767, "grad_norm": 0.1864636242389679, "kl": 0.10595703125, "learning_rate": 5.228075726112785e-07, "loss": 0.0685, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 518.3571624755859, "epoch": 7.48062015503876, "grad_norm": 0.27271780371665955, "kl": 0.115478515625, "learning_rate": 5.207844057150768e-07, "loss": 0.0914, "reward": 0.08750000782310963, "reward_std": 0.01262690732255578, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000447034836, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 502.76788330078125, "epoch": 7.496124031007752, "grad_norm": 0.25202253460884094, "kl": 0.2041015625, "learning_rate": 5.188547722405437e-07, "loss": 0.1187, "reward": 0.0892857238650322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 470.0893020629883, "epoch": 7.511627906976744, "grad_norm": 0.33710911870002747, "kl": 1.2530517578125, "learning_rate": 5.170187531512351e-07, "loss": 0.0767, "reward": 0.0892857238650322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 499.73216247558594, "epoch": 7.5271317829457365, "grad_norm": 0.12209226191043854, "kl": 0.141845703125, "learning_rate": 5.152764254828348e-07, "loss": 0.0716, "reward": 0.08928572200238705, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 456.92859649658203, "epoch": 7.542635658914729, "grad_norm": 0.266292005777359, "kl": 0.1436767578125, "learning_rate": 5.136278623399225e-07, "loss": 0.0962, "reward": 0.09107143618166447, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 333.2678756713867, "epoch": 7.558139534883721, "grad_norm": 0.7162752747535706, "kl": 0.4906005859375, "learning_rate": 5.120731328929058e-07, "loss": 0.0237, "reward": 0.09821429289877415, "reward_std": 0.002525381511077285, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9821428656578064, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 456.1250228881836, "epoch": 7.573643410852713, "grad_norm": 0.4401909410953522, "kl": 1.7235107421875, "learning_rate": 5.106123023751187e-07, "loss": 0.0754, "reward": 0.0892857238650322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 482.6250228881836, "epoch": 7.589147286821706, "grad_norm": 0.48811131715774536, "kl": 0.47265625, "learning_rate": 5.092454320800833e-07, "loss": 0.1142, "reward": 0.0892857201397419, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571939468384, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 517.1250228881836, "epoch": 7.604651162790698, "grad_norm": 0.28417670726776123, "kl": 0.2467041015625, "learning_rate": 5.079725793589405e-07, "loss": 0.1384, "reward": 0.08750000968575478, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000596046448, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 587.6964569091797, "epoch": 7.62015503875969, "grad_norm": 0.27875715494155884, "kl": 0.3658447265625, "learning_rate": 5.067937976180407e-07, "loss": 0.1719, "reward": 0.08392857946455479, "reward_std": 0.02272843336686492, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857760190964, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 524.5893096923828, "epoch": 7.635658914728682, "grad_norm": 0.35742247104644775, "kl": 0.27734375, "learning_rate": 5.057091363167046e-07, "loss": 0.0743, "reward": 0.08928572200238705, "reward_std": 0.010101525811478496, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 457.9285888671875, "epoch": 7.651162790697675, "grad_norm": 0.6690515875816345, "kl": 0.390625, "learning_rate": 5.047186409651489e-07, "loss": 0.086, "reward": 0.09107143804430962, "reward_std": 0.012626907555386424, "rewards/code_reward": 0.0, "rewards/format_reward": 0.910714328289032, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 462.62501525878906, "epoch": 7.666666666666667, "grad_norm": 0.2108573466539383, "kl": 0.175048828125, "learning_rate": 5.038223531225742e-07, "loss": 0.0587, "reward": 0.09107143245637417, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9107143133878708, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 374.7321586608887, "epoch": 7.682170542635659, "grad_norm": 0.3733845353126526, "kl": 0.33349609375, "learning_rate": 5.030203103954232e-07, "loss": 0.0632, "reward": 0.09464286454021931, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9464285969734192, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 417.73216247558594, "epoch": 7.6976744186046515, "grad_norm": 0.43952879309654236, "kl": 1.1201171875, "learning_rate": 5.023125464358026e-07, "loss": 0.0543, "reward": 0.09285715036094189, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.9285714626312256, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 549.8393020629883, "epoch": 7.713178294573644, "grad_norm": 0.29617685079574585, "kl": 0.201416015625, "learning_rate": 5.016990909400709e-07, "loss": 0.113, "reward": 0.0857142936438322, "reward_std": 0.015152289066463709, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428954601288, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 558.8393020629883, "epoch": 7.728682170542635, "grad_norm": 0.6212316155433655, "kl": 0.447265625, "learning_rate": 5.011799696475915e-07, "loss": 0.1539, "reward": 0.0857142936438322, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.85714291036129, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 492.17860412597656, "epoch": 7.7441860465116275, "grad_norm": 0.23731939494609833, "kl": 0.216064453125, "learning_rate": 5.007552043396547e-07, "loss": 0.071, "reward": 0.0892857201397419, "reward_std": 0.01010152604430914, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8928571790456772, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 559.7321701049805, "epoch": 7.75968992248062, "grad_norm": 0.44135406613349915, "kl": 0.2109375, "learning_rate": 5.004248128385618e-07, "loss": 0.149, "reward": 0.08571429178118706, "reward_std": 0.020203051855787635, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571428805589676, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 568.7321624755859, "epoch": 7.775193798449612, "grad_norm": 0.259131520986557, "kl": 0.3043212890625, "learning_rate": 5.001888090068784e-07, "loss": 0.1772, "reward": 0.08392857760190964, "reward_std": 0.022728433599695563, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 525.1607437133789, "epoch": 7.790697674418604, "grad_norm": 0.23898068070411682, "kl": 0.278564453125, "learning_rate": 5.000472027468528e-07, "loss": 0.1237, "reward": 0.08750000968575478, "reward_std": 0.017677670577540994, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8750000596046448, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 524.2143173217773, "epoch": 7.8062015503875966, "grad_norm": 0.4151113033294678, "kl": 1.75830078125, "learning_rate": 5.000000000000001e-07, "loss": 0.0621, "reward": 0.08392857946455479, "reward_std": 0.0075761445332318544, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8392857611179352, "step": 500 }, { "epoch": 7.8062015503875966, "step": 500, "total_flos": 0.0, "train_loss": 0.059448377258595884, "train_runtime": 22252.9361, "train_samples_per_second": 1.258, "train_steps_per_second": 0.022 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }