{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987438399845395, "eval_steps": 100, "global_step": 646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 405.56965885162356, "epoch": 0.007730215479756498, "grad_norm": 0.7462543657527252, "kl": 0.0005312919616699219, "learning_rate": 1.5384615384615387e-06, "loss": 0.0, "reward": 0.7369470089673996, "reward_std": 0.5109932633116842, "rewards/accuracy_reward": 0.16250000894069672, "rewards/cosine_scaled_reward": -0.08805303737608483, "rewards/format_reward": 0.4776785926893353, "rewards/reasoning_steps_reward": 0.1848214427009225, "step": 5 }, { "completion_length": 382.88305282592773, "epoch": 0.015460430959512996, "grad_norm": 1.2904499591788734, "kl": 0.0057563304901123045, "learning_rate": 3.0769230769230774e-06, "loss": 0.0002, "reward": 0.7364962549880147, "reward_std": 0.49372123572975396, "rewards/accuracy_reward": 0.14464286556467415, "rewards/cosine_scaled_reward": -0.10219426036346704, "rewards/format_reward": 0.5366071660071612, "rewards/reasoning_steps_reward": 0.15744048808701336, "step": 10 }, { "completion_length": 207.46251001358033, "epoch": 0.023190646439269495, "grad_norm": 1.9645001561346451, "kl": 0.09119796752929688, "learning_rate": 4.615384615384616e-06, "loss": 0.0036, "reward": 0.8926077246665954, "reward_std": 0.35459292340092363, "rewards/accuracy_reward": 0.07142857508733869, "rewards/cosine_scaled_reward": -0.08804708741954528, "rewards/format_reward": 0.8491071783006191, "rewards/reasoning_steps_reward": 0.0601190519053489, "step": 15 }, { "completion_length": 136.9875066280365, "epoch": 0.03092086191902599, "grad_norm": 3.03448748706415, "kl": 0.20247802734375, "learning_rate": 6.153846153846155e-06, "loss": 0.0081, "reward": 0.9553401380777359, "reward_std": 0.2689471357734874, "rewards/accuracy_reward": 0.06696428917348385, "rewards/cosine_scaled_reward": -0.03483848163741641, "rewards/format_reward": 0.882142897695303, "rewards/reasoning_steps_reward": 0.041071431757882235, "step": 20 }, { "completion_length": 105.67768306732178, "epoch": 0.03865107739878249, "grad_norm": 1.2320400007971612, "kl": 0.316986083984375, "learning_rate": 7.692307692307694e-06, "loss": 0.0127, "reward": 0.9907999753952026, "reward_std": 0.26647315019290546, "rewards/accuracy_reward": 0.0750000043772161, "rewards/cosine_scaled_reward": 0.008061838883440942, "rewards/format_reward": 0.8705357514321804, "rewards/reasoning_steps_reward": 0.03720238357782364, "step": 25 }, { "completion_length": 37.38125160932541, "epoch": 0.04638129287853899, "grad_norm": 1.25978990604801, "kl": 0.8052734375, "learning_rate": 9.230769230769232e-06, "loss": 0.0322, "reward": 1.0163824677467346, "reward_std": 0.08920623137296388, "rewards/accuracy_reward": 0.029464287217706442, "rewards/cosine_scaled_reward": 0.007156228994062985, "rewards/format_reward": 0.9687500141561032, "rewards/reasoning_steps_reward": 0.011011905502527952, "step": 30 }, { "completion_length": 18.889286601543425, "epoch": 0.054111508358295486, "grad_norm": 2.5055376347575145, "kl": 1.8021484375, "learning_rate": 1.076923076923077e-05, "loss": 0.072, "reward": 0.9922278635203838, "reward_std": 0.0161491093209861, "rewards/accuracy_reward": 0.0026785715483129023, "rewards/cosine_scaled_reward": -0.0039031302643707023, "rewards/format_reward": 0.9919642895460129, "rewards/reasoning_steps_reward": 0.0014880953822284937, "step": 35 }, { "completion_length": 21.922322404384612, "epoch": 0.06184172383805198, "grad_norm": 0.8899563849909223, "kl": 1.80244140625, "learning_rate": 1.230769230769231e-05, "loss": 0.0721, "reward": 0.9816747322678566, "reward_std": 0.030321285296668065, "rewards/accuracy_reward": 0.001785714365541935, "rewards/cosine_scaled_reward": -0.003741989450645633, "rewards/format_reward": 0.9812500089406967, "rewards/reasoning_steps_reward": 0.0023809525649994613, "step": 40 }, { "completion_length": 14.261607849597931, "epoch": 0.06957193931780849, "grad_norm": 1.4119055096863828, "kl": 1.7033203125, "learning_rate": 1.3846153846153847e-05, "loss": 0.0681, "reward": 0.9888324901461601, "reward_std": 0.013765288369779683, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.001643749103823211, "rewards/format_reward": 0.9901785761117935, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 45 }, { "completion_length": 72.1785749554634, "epoch": 0.07730215479756498, "grad_norm": 1.210848250852318, "kl": 1.85546875, "learning_rate": 1.5384615384615387e-05, "loss": 0.0742, "reward": 0.9152212306857109, "reward_std": 0.0736794498127665, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021981198005960323, "rewards/format_reward": 0.936607152968645, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 50 }, { "completion_length": 14.961607897281647, "epoch": 0.08503237027732148, "grad_norm": 0.4500475012306279, "kl": 3.1853515625, "learning_rate": 1.6923076923076924e-05, "loss": 0.1274, "reward": 0.9923057183623314, "reward_std": 0.009002923837056188, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0023372016948997043, "rewards/format_reward": 0.9946428596973419, "rewards/reasoning_steps_reward": 0.0, "step": 55 }, { "completion_length": 14.094643568992614, "epoch": 0.09276258575707798, "grad_norm": 59.52949637785707, "kl": 5.4390625, "learning_rate": 1.8461538461538465e-05, "loss": 0.2176, "reward": 0.18925664581884122, "reward_std": 0.024113523228817258, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0038981260504897365, "rewards/format_reward": 0.1928571481257677, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 60 }, { "completion_length": 4.946428689360618, "epoch": 0.10049280123683448, "grad_norm": 2.2826304271189377, "kl": 6.75234375, "learning_rate": 2e-05, "loss": 0.27, "reward": -0.0009235852706297009, "reward_std": 0.0012105709233253491, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0012212042873215978, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 65 }, { "completion_length": 67.41250307410955, "epoch": 0.10822301671659097, "grad_norm": 15.03928087271465, "kl": 16.14609375, "learning_rate": 1.999634547413886e-05, "loss": 0.6457, "reward": -0.0246695591218554, "reward_std": 0.03053633879125641, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025860035596349463, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0011904762592166661, "step": 70 }, { "completion_length": 56.75000178366899, "epoch": 0.11595323219634747, "grad_norm": 0.19042216089252142, "kl": 21.287109375, "learning_rate": 1.9985384567667278e-05, "loss": 0.8517, "reward": -0.023805892881091494, "reward_std": 0.02873828581006137, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02440113128282064, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 75 }, { "completion_length": 1.0008928582072258, "epoch": 0.12368344767610397, "grad_norm": 0.012503826974021894, "kl": 25.846875, "learning_rate": 1.9967125291968495e-05, "loss": 1.0338, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": 0.0, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 80 }, { "completion_length": 1.0, "epoch": 0.13141366315586048, "grad_norm": 0.023217182066212497, "kl": 25.7984375, "learning_rate": 1.9941580992841562e-05, "loss": 1.032, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": 0.0, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 85 }, { "completion_length": 1.0, "epoch": 0.13914387863561697, "grad_norm": 0.20269584411345673, "kl": 25.7359375, "learning_rate": 1.990877034074683e-05, "loss": 1.0296, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": 0.0, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 90 }, { "completion_length": 548.6687725067138, "epoch": 0.14687409411537347, "grad_norm": 5147.653288696975, "kl": 79.7378173828125, "learning_rate": 1.9868717317159617e-05, "loss": 3.1936, "reward": -0.14213015870191156, "reward_std": 0.09109379828441888, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14242777726612985, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 95 }, { "completion_length": 821.0839622497558, "epoch": 0.15460430959512997, "grad_norm": 27.29688379902277, "kl": 1197.9787109375, "learning_rate": 1.9821451197042028e-05, "loss": 47.9928, "reward": -0.1396964536048472, "reward_std": 0.10962239899672568, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14148216703906655, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0017857144586741925, "step": 100 }, { "epoch": 0.15460430959512997, "eval_completion_length": 321.4732246398926, "eval_kl": 8.53125, "eval_loss": 0.34084638953208923, "eval_reward": -0.022001524968800368, "eval_reward_std": 0.03211871034000069, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.023489620012696832, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0014880953822284937, "eval_runtime": 48.9303, "eval_samples_per_second": 2.023, "eval_steps_per_second": 0.082, "step": 100 }, { "completion_length": 104.91607556939125, "epoch": 0.16233452507488647, "grad_norm": 1.1693211625065207, "kl": 11.8296875, "learning_rate": 1.9767006527445728e-05, "loss": 0.4732, "reward": -0.011938850855904093, "reward_std": 0.015576600710505772, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01193885096483882, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 105 }, { "completion_length": 58.97053833901882, "epoch": 0.17006474055464296, "grad_norm": 0.7094878044712477, "kl": 11.401171875, "learning_rate": 1.9705423102261324e-05, "loss": 0.4561, "reward": -0.014024021557702326, "reward_std": 0.017935297856251965, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.014619259324831546, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 110 }, { "completion_length": 93.93661117255688, "epoch": 0.17779495603439946, "grad_norm": 5.2388464492760765, "kl": 11.269140625, "learning_rate": 1.9636745933132807e-05, "loss": 0.4507, "reward": -0.0299409940995929, "reward_std": 0.031794142858944954, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.030238612740163262, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 115 }, { "completion_length": 48.877681124210355, "epoch": 0.18552517151415596, "grad_norm": 680.6777574886382, "kl": 215.988671875, "learning_rate": 1.956102521655831e-05, "loss": 8.6316, "reward": -0.003459748199139767, "reward_std": 0.01691803910527767, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.010602605762341, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.007142857694998383, "step": 120 }, { "completion_length": 170.71250774115325, "epoch": 0.19325538699391245, "grad_norm": 7.712555567920521, "kl": 18.38515625, "learning_rate": 1.9478316297201218e-05, "loss": 0.735, "reward": -0.017600624014572474, "reward_std": 0.05708557971850894, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0423030068389707, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.024702383019030095, "step": 125 }, { "completion_length": 47.75714521855116, "epoch": 0.20098560247366895, "grad_norm": 0.5989444454610684, "kl": 21.9390625, "learning_rate": 1.9388679627438486e-05, "loss": 0.8777, "reward": -0.017952405636268143, "reward_std": 0.0257039325139214, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.020630977310443742, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.002678571688011289, "step": 130 }, { "completion_length": 63.475896288454535, "epoch": 0.20871581795342545, "grad_norm": 1.3923219106012577, "kl": 22.10546875, "learning_rate": 1.9292180723175656e-05, "loss": 0.8842, "reward": -0.020242841815515077, "reward_std": 0.031060412585736684, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026790461270310574, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.006547619681805372, "step": 135 }, { "completion_length": 55.86518155187368, "epoch": 0.21644603343318194, "grad_norm": 5.0902986832024775, "kl": 22.8453125, "learning_rate": 1.9188890115960967e-05, "loss": 0.9137, "reward": -0.01244992250547421, "reward_std": 0.019453834210839994, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.018402304255588663, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.005952381482347846, "step": 140 }, { "completion_length": 76.11786096841097, "epoch": 0.22417624891293844, "grad_norm": 0.21422345894603065, "kl": 22.56484375, "learning_rate": 1.9078883301433488e-05, "loss": 0.9024, "reward": -0.013156807044567209, "reward_std": 0.02494261453293838, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01970442655829796, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.006547619635239243, "step": 145 }, { "completion_length": 1.0053571447730065, "epoch": 0.23190646439269494, "grad_norm": 0.0780953061053383, "kl": 25.6546875, "learning_rate": 1.8962240684142923e-05, "loss": 1.026, "reward": -3.634977332467315e-08, "reward_std": 5.140634300460078e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.634977332467315e-08, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 150 }, { "completion_length": 1.3276785850524901, "epoch": 0.23963667987245144, "grad_norm": 0.03077389791272203, "kl": 25.740625, "learning_rate": 1.883904751878156e-05, "loss": 1.0296, "reward": -0.0002941889921203256, "reward_std": 0.000416046055033803, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0002941889921203256, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 155 }, { "completion_length": 1.0, "epoch": 0.24736689535220793, "grad_norm": 0.005409651293107311, "kl": 25.7171875, "learning_rate": 1.8709393847871146e-05, "loss": 1.0287, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": 0.0, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 160 }, { "completion_length": 1.0017857149243354, "epoch": 0.25509711083196446, "grad_norm": 0.006270140344596234, "kl": 25.6875, "learning_rate": 1.857337443595034e-05, "loss": 1.0277, "reward": -1.8613073962114868e-07, "reward_std": 2.6322861685912357e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.8613073962114868e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 165 }, { "completion_length": 1.3258928775787353, "epoch": 0.26282732631172095, "grad_norm": 0.011192815865153746, "kl": 25.7875, "learning_rate": 1.8431088700310846e-05, "loss": 1.0315, "reward": 2.2296422685030848e-05, "reward_std": 3.1531901913695035e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0002753226552158594, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 170 }, { "completion_length": 1.0008928582072258, "epoch": 0.27055754179147745, "grad_norm": 0.21322386913432315, "kl": 25.7890625, "learning_rate": 1.8282640638332773e-05, "loss": 1.0316, "reward": -1.1015175438444658e-09, "reward_std": 1.557780926475516e-09, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1015175438444658e-09, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 175 }, { "completion_length": 21.52053656876087, "epoch": 0.27828775727123395, "grad_norm": 3.021771451610135, "kl": 17.896875, "learning_rate": 1.8128138751472432e-05, "loss": 0.7157, "reward": -0.006260214662032926, "reward_std": 0.011234662607045287, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.008045928952500336, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0017857144586741925, "step": 180 }, { "completion_length": 25.840179923176766, "epoch": 0.28601797275099045, "grad_norm": 0.06045436907992946, "kl": 14.9546875, "learning_rate": 1.7967695965958044e-05, "loss": 0.5985, "reward": -0.014501136258346037, "reward_std": 0.022527561223525793, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01717970742170678, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00267857164144516, "step": 185 }, { "completion_length": 40.05446655750275, "epoch": 0.29374818823074694, "grad_norm": 0.4850802805739142, "kl": 14.3734375, "learning_rate": 1.780142955025139e-05, "loss": 0.5749, "reward": -0.01828156500302498, "reward_std": 0.015761358443027618, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01828156588777006, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 190 }, { "completion_length": 4.553571638464928, "epoch": 0.30147840371050344, "grad_norm": 0.6589006761117226, "kl": 11.007421875, "learning_rate": 1.7629461029335683e-05, "loss": 0.4403, "reward": -2.3027300318290145e-05, "reward_std": 1.0056229485755353e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.302729985217411e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 195 }, { "completion_length": 7.213393118977547, "epoch": 0.30920861919025994, "grad_norm": 1.7478190427427973, "kl": 11.348828125, "learning_rate": 1.745191609589231e-05, "loss": 0.454, "reward": -0.00012043448294605242, "reward_std": 5.862412571779885e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00012043448467409234, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 200 }, { "epoch": 0.30920861919025994, "eval_completion_length": 2.0, "eval_kl": 11.609375, "eval_loss": 0.4632655084133148, "eval_reward": -1.2336995496298186e-06, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -1.2336995496298186e-06, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0, "eval_runtime": 3.9058, "eval_samples_per_second": 25.347, "eval_steps_per_second": 1.024, "step": 200 }, { "completion_length": 1.2241071552038192, "epoch": 0.31693883467001643, "grad_norm": 0.03938926661752981, "kl": 22.55859375, "learning_rate": 1.7268924518431437e-05, "loss": 0.9023, "reward": -3.0622175763994617e-07, "reward_std": 7.165778619366847e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.0622175763994617e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 205 }, { "completion_length": 1.0035714328289032, "epoch": 0.32466905014977293, "grad_norm": 0.2716599680471258, "kl": 25.7859375, "learning_rate": 1.7080620046443503e-05, "loss": 1.0313, "reward": -1.376801691321816e-07, "reward_std": 1.9470915901820263e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.376801691321816e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 210 }, { "completion_length": 1.0017857164144517, "epoch": 0.33239926562952943, "grad_norm": 0.06638642787452904, "kl": 25.753125, "learning_rate": 1.6887140312641036e-05, "loss": 1.0303, "reward": -8.040706021006371e-08, "reward_std": 1.1371276684712938e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -8.040706021006371e-08, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 215 }, { "completion_length": 1.0125000104308128, "epoch": 0.3401294811092859, "grad_norm": 0.47431766351331595, "kl": 25.7171875, "learning_rate": 1.6688626732362192e-05, "loss": 1.0287, "reward": -6.87287168332773e-07, "reward_std": 9.719708138788973e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.87287168332773e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 220 }, { "completion_length": 28.308037100732328, "epoch": 0.3478596965890424, "grad_norm": 1.2226888752634715, "kl": 18.17109375, "learning_rate": 1.6485224400209557e-05, "loss": 0.7269, "reward": -0.013642355828355424, "reward_std": 0.015292952457497755, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01364235559717324, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 225 }, { "completion_length": 1.9526786044239999, "epoch": 0.3555899120687989, "grad_norm": 0.4342653521948109, "kl": 19.8859375, "learning_rate": 1.6277081983999742e-05, "loss": 0.7956, "reward": -1.082232572571229e-05, "reward_std": 9.206240498826901e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.082232524822757e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 230 }, { "completion_length": 2.000000002980232, "epoch": 0.3633201275485554, "grad_norm": 0.004027750934258561, "kl": 20.1640625, "learning_rate": 1.6064351616101318e-05, "loss": 0.8066, "reward": -1.1172613938015274e-05, "reward_std": 1.2617196887276805e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1172613096732675e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 235 }, { "completion_length": 2.0, "epoch": 0.3710503430283119, "grad_norm": 0.00610052275107364, "kl": 20.18125, "learning_rate": 1.5847188782240473e-05, "loss": 0.8071, "reward": -1.1103224096586928e-05, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1103223187092226e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 240 }, { "completion_length": 2.000892859697342, "epoch": 0.3787805585080684, "grad_norm": 0.008523403395785032, "kl": 20.1515625, "learning_rate": 1.562575220785569e-05, "loss": 0.8063, "reward": -1.1147282725687546e-05, "reward_std": 6.230831672837667e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1147281827561529e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 245 }, { "completion_length": 2.0, "epoch": 0.3865107739878249, "grad_norm": 0.04067722857474135, "kl": 20.1734375, "learning_rate": 1.5400203742084508e-05, "loss": 0.8069, "reward": -1.1103224096586928e-05, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1103223187092226e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 250 }, { "completion_length": 2.2633929014205934, "epoch": 0.3942409894675814, "grad_norm": 48542.5507771523, "kl": 263.5796875, "learning_rate": 1.5170708239467143e-05, "loss": 10.5377, "reward": -3.0240616274568312e-05, "reward_std": 9.291260630561737e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.0240615694765437e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 255 }, { "completion_length": 2.563392972946167, "epoch": 0.4019712049473379, "grad_norm": 2.4546075517067383, "kl": 1508.42734375, "learning_rate": 1.4937433439453465e-05, "loss": 60.4181, "reward": -5.116900102848376e-05, "reward_std": 1.7354888933596156e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -5.116900136954427e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 260 }, { "completion_length": 3.5651787251234053, "epoch": 0.4097014204270944, "grad_norm": 10393736.856195485, "kl": 72436.23515625, "learning_rate": 1.4700549843801359e-05, "loss": 2896.5656, "reward": -0.0004767782709336643, "reward_std": 0.0006390644782072741, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0004767782703879675, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 265 }, { "completion_length": 3.6241072833538057, "epoch": 0.4174316359068509, "grad_norm": 8.493623241215658, "kl": 32.7546875, "learning_rate": 1.4460230591956097e-05, "loss": 1.3091, "reward": -0.0004817824598603693, "reward_std": 0.0006431983984157341, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0004817824589736119, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 270 }, { "completion_length": 162.31697244644164, "epoch": 0.4251618513866074, "grad_norm": 6.964369964518173, "kl": 8.87421875, "learning_rate": 1.421665133450184e-05, "loss": 0.3548, "reward": -0.07739254070850166, "reward_std": 0.062141555817038355, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0773925410838956, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 275 }, { "completion_length": 96.25625440478325, "epoch": 0.4328920668663639, "grad_norm": 9.681191404680547, "kl": 17.048046875, "learning_rate": 1.3969990104777712e-05, "loss": 0.6819, "reward": -0.03923700120972171, "reward_std": 0.03926289372857354, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.039237000942125635, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 280 }, { "completion_length": 3.1446429938077927, "epoch": 0.4406222823461204, "grad_norm": 1.6939135445363234, "kl": 19.3328125, "learning_rate": 1.3720427188752306e-05, "loss": 0.7732, "reward": -0.00017149213965694798, "reward_std": 0.0002037051583556604, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00017149214137361922, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 285 }, { "completion_length": 2.826785835623741, "epoch": 0.4483524978258769, "grad_norm": 23.667469043825736, "kl": 21.6515625, "learning_rate": 1.3468144993251735e-05, "loss": 0.8657, "reward": -0.00012078473635028786, "reward_std": 0.0001242638816734143, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00012078473641849997, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 290 }, { "completion_length": 2.306250122189522, "epoch": 0.4560827133056334, "grad_norm": 3.4891254852910563, "kl": 16.62890625, "learning_rate": 1.3213327912637563e-05, "loss": 0.6651, "reward": -4.597324716542062e-05, "reward_std": 4.538521250552208e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.597324708583983e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 295 }, { "completion_length": 2.0589286506175997, "epoch": 0.4638129287853899, "grad_norm": 7.581171823316156, "kl": 4328.1982421875, "learning_rate": 1.295616219403197e-05, "loss": 173.2372, "reward": -1.8399331577256816e-05, "reward_std": 1.0234137062070658e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.839933122482762e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 300 }, { "epoch": 0.4638129287853899, "eval_completion_length": 2.013392925262451, "eval_kl": 9.3125, "eval_loss": 0.3736799657344818, "eval_reward": -1.2055932870680408e-05, "eval_reward_std": 1.4875331686425852e-06, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -1.2055932074872544e-05, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0, "eval_runtime": 3.909, "eval_samples_per_second": 25.326, "eval_steps_per_second": 1.023, "step": 300 }, { "completion_length": 2.028571492433548, "epoch": 0.4715431442651464, "grad_norm": 344.48530790093184, "kl": 10.771484375, "learning_rate": 1.2696835801188816e-05, "loss": 0.4306, "reward": -1.3102200352932413e-05, "reward_std": 2.683673558578903e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.3102199898185062e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 305 }, { "completion_length": 2.159821531176567, "epoch": 0.47927335974490287, "grad_norm": 70.20341939548675, "kl": 4.1146484375, "learning_rate": 1.2435538277109919e-05, "loss": 0.1645, "reward": -3.179889416742299e-05, "reward_std": 2.5794675986645645e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.1798893940049314e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 310 }, { "completion_length": 3.5107144564390182, "epoch": 0.48700357522465937, "grad_norm": 641.6783163094212, "kl": 33.4169921875, "learning_rate": 1.2172460605507126e-05, "loss": 1.3355, "reward": -0.0001592895164321817, "reward_std": 0.00014331816136063934, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.000159289515522687, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 315 }, { "completion_length": 3.3508930176496508, "epoch": 0.49473379070441587, "grad_norm": 145.55967385563756, "kl": 16.3171875, "learning_rate": 1.19077950712113e-05, "loss": 0.653, "reward": -0.00014978365513798053, "reward_std": 0.00014508154267787177, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00014978365538809157, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 320 }, { "completion_length": 2.4000001162290574, "epoch": 0.5024640061841724, "grad_norm": 34.2925539485873, "kl": 90.2671875, "learning_rate": 1.1641735119630373e-05, "loss": 3.6146, "reward": -3.094260448506247e-05, "reward_std": 2.256790549353127e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.094260480338562e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 325 }, { "completion_length": 2.073214355111122, "epoch": 0.5101942216639289, "grad_norm": 100.58626957516375, "kl": 15.82890625, "learning_rate": 1.137447521535908e-05, "loss": 0.6332, "reward": -1.346574514400345e-05, "reward_std": 3.1479559218894336e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.346574463241268e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 330 }, { "completion_length": 2.016964301466942, "epoch": 0.5179244371436854, "grad_norm": 80.23510373962755, "kl": 6.93681640625, "learning_rate": 1.110621070004378e-05, "loss": 0.2775, "reward": -1.15173755602882e-05, "reward_std": 5.856986717844848e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1517374741742969e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 335 }, { "completion_length": 3.837500014901161, "epoch": 0.5256546526234419, "grad_norm": 54.951541452460454, "kl": 5.483740234375, "learning_rate": 1.0837137649606241e-05, "loss": 0.2192, "reward": -0.00045855455310857, "reward_std": 1.340229282504879e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0004585545523241308, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 340 }, { "completion_length": 2.250892898440361, "epoch": 0.5333848681031984, "grad_norm": 5.7796641627939955, "kl": 3.30419921875, "learning_rate": 1.0567452730930743e-05, "loss": 0.1322, "reward": -9.29596154378487e-05, "reward_std": 5.985533989090186e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -9.295961492625793e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 345 }, { "completion_length": 2.010714316368103, "epoch": 0.5411150835829549, "grad_norm": 5.993181817882121, "kl": 18.634033203125, "learning_rate": 1.0297353058119209e-05, "loss": 0.7487, "reward": -1.1347752183610282e-05, "reward_std": 3.7385478890428203e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1347751353696367e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 350 }, { "completion_length": 2.031250071525574, "epoch": 0.5488452990627114, "grad_norm": 25.848585080207908, "kl": 3.104638671875, "learning_rate": 1.0027036048419514e-05, "loss": 0.1242, "reward": -1.1796055684953898e-05, "reward_std": 1.0358914863672908e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.1796055014201557e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 355 }, { "completion_length": 2.9714287370443344, "epoch": 0.5565755145424679, "grad_norm": 93.38163993347811, "kl": 8.349462890625, "learning_rate": 9.756699277932196e-06, "loss": 0.3341, "reward": -4.401462517762411e-05, "reward_std": 3.163113681239338e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.401462485930097e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 360 }, { "completion_length": 8.554464668035507, "epoch": 0.5643057300222244, "grad_norm": 10.140627474325306, "kl": 8.22578125, "learning_rate": 9.486540337201046e-06, "loss": 0.329, "reward": -0.0005188777377952647, "reward_std": 0.0003973965808199864, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0005188777389776078, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 365 }, { "completion_length": 54.99375256896019, "epoch": 0.5720359455019809, "grad_norm": 5.957561618335069, "kl": 6.14736328125, "learning_rate": 9.216756686793163e-06, "loss": 0.246, "reward": -0.048145094428764426, "reward_std": 0.03985454668636521, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04814509411298786, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 370 }, { "completion_length": 2.9062501341104507, "epoch": 0.5797661609817374, "grad_norm": 3.442877067184154, "kl": 14.48828125, "learning_rate": 8.94754551297402e-06, "loss": 0.5794, "reward": -6.0367731987298614e-05, "reward_std": 5.186144211108967e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.036773227151571e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 375 }, { "completion_length": 1.7392858043313026, "epoch": 0.5874963764614939, "grad_norm": 2.5960117069670465, "kl": 15.17109375, "learning_rate": 8.67910358358298e-06, "loss": 0.6067, "reward": -9.306566921196691e-06, "reward_std": 3.4472319889289336e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -9.306566844458076e-06, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 380 }, { "completion_length": 1.0375000193715096, "epoch": 0.5952265919412504, "grad_norm": 0.25768097061230005, "kl": 25.1875, "learning_rate": 8.411627104214675e-06, "loss": 1.0077, "reward": -4.163708844373559e-07, "reward_std": 2.523588761960127e-07, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.163708844373559e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 385 }, { "completion_length": 1.020535722374916, "epoch": 0.6029568074210069, "grad_norm": 0.2792979999202264, "kl": 25.475, "learning_rate": 8.145311574811325e-06, "loss": 1.0192, "reward": -2.203019718649557e-07, "reward_std": 3.1155297364193756e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.203019718649557e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 390 }, { "completion_length": 1.0116071477532387, "epoch": 0.6106870229007634, "grad_norm": 0.3039976370074429, "kl": 25.5734375, "learning_rate": 7.880351646770824e-06, "loss": 1.0227, "reward": -1.2887670166605858e-07, "reward_std": 1.4019937566445151e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.2887670166605858e-07, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 395 }, { "completion_length": 1.0044642880558967, "epoch": 0.6184172383805199, "grad_norm": 14.943634375914652, "kl": 23.6125, "learning_rate": 7.616940980675004e-06, "loss": 0.9445, "reward": -4.95679621792533e-08, "reward_std": 1.4019937566445151e-08, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.95679621792533e-08, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 400 }, { "epoch": 0.6184172383805199, "eval_completion_length": 1.008928582072258, "eval_kl": 12.2890625, "eval_loss": 0.49081891775131226, "eval_reward": -9.91359243585066e-08, "eval_reward_std": 1.401993756644515e-07, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -9.91359243585066e-08, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0, "eval_runtime": 3.9135, "eval_samples_per_second": 25.297, "eval_steps_per_second": 1.022, "step": 400 }, { "completion_length": 1.7696429282426833, "epoch": 0.6261474538602764, "grad_norm": 12983.515555809317, "kl": 293.42099609375, "learning_rate": 7.355272104742132e-06, "loss": 11.7569, "reward": -6.9121698422769384e-06, "reward_std": 6.5103086452467094e-06, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.912169713668703e-06, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 405 }, { "completion_length": 17.47142935991287, "epoch": 0.6338776693400329, "grad_norm": 24.820078172147845, "kl": 117.49140625, "learning_rate": 7.095536274107046e-06, "loss": 4.7063, "reward": -0.0006443509351811372, "reward_std": 0.0011860909646429717, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0009419700301350531, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 410 }, { "completion_length": 23.688393944501875, "epoch": 0.6416078848197894, "grad_norm": 16.557827311293913, "kl": 9.744921875, "learning_rate": 6.837923331031761e-06, "loss": 0.3898, "reward": -0.0012267566772607096, "reward_std": 0.005331564269795308, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0030124710897553087, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0017857144586741925, "step": 415 }, { "completion_length": 3.3517858684062958, "epoch": 0.6493381002995459, "grad_norm": 1.692182099435314, "kl": 16.48828125, "learning_rate": 6.58262156614881e-06, "loss": 0.6594, "reward": 0.0001806508086886538, "reward_std": 0.0005466543234465604, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00011696825112039733, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 420 }, { "completion_length": 8.130357471108436, "epoch": 0.6570683157793024, "grad_norm": 49972.11357533271, "kl": 602.6296875, "learning_rate": 6.3298175808386284e-06, "loss": 24.2147, "reward": 0.002693667745747774, "reward_std": 0.007418848656948284, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0017706183462685488, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.004464286146685481, "step": 425 }, { "completion_length": 37.81518050432205, "epoch": 0.6647985312590589, "grad_norm": 997499.4960779066, "kl": 5904.2171875, "learning_rate": 6.079696150841634e-06, "loss": 236.481, "reward": 0.0069611702572728975, "reward_std": 0.04231391001048905, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025181690172394154, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.032142859697341916, "step": 430 }, { "completion_length": 80.26518244743347, "epoch": 0.6725287467388154, "grad_norm": 1679.9201550809582, "kl": 93.42734375, "learning_rate": 5.832440091204698e-06, "loss": 3.7362, "reward": 0.017997386856882257, "reward_std": 0.08549039482022636, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04718118982855231, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0651785762514919, "step": 435 }, { "completion_length": 48.38482351303101, "epoch": 0.6802589622185718, "grad_norm": 1961.7217230617018, "kl": 92.44609375, "learning_rate": 5.588230122660672e-06, "loss": 3.6949, "reward": -0.001163672623806633, "reward_std": 0.022728995434590615, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.011877958994591608, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.010714286705479026, "step": 440 }, { "completion_length": 25.872322690486907, "epoch": 0.6879891776983283, "grad_norm": 366.54152281214704, "kl": 27.91484375, "learning_rate": 5.347244739538677e-06, "loss": 1.116, "reward": 0.003250818374363007, "reward_std": 0.0114803760588984, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0032968010993499773, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.006547619681805372, "step": 445 }, { "completion_length": 13.839286398887634, "epoch": 0.6957193931780848, "grad_norm": 42.50107796477912, "kl": 13.202734375, "learning_rate": 5.109660079301668e-06, "loss": 0.5286, "reward": 0.0055045452249032675, "reward_std": 0.010016416979851783, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0010430743366669048, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.006547619681805372, "step": 450 }, { "completion_length": 10.257143294811248, "epoch": 0.7034496086578413, "grad_norm": 21.172769458024856, "kl": 10.9296875, "learning_rate": 4.875649793806655e-06, "loss": 0.4372, "reward": 0.008970338363269548, "reward_std": 0.013051454542801367, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0005534718308808806, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00952381044626236, "step": 455 }, { "completion_length": 4.476785945892334, "epoch": 0.7111798241375978, "grad_norm": 16114.931690998841, "kl": 80.304296875, "learning_rate": 4.64538492238166e-06, "loss": 3.2165, "reward": 0.0016558387350755766, "reward_std": 0.002667017420742468, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00012987566916535797, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0017857144586741925, "step": 460 }, { "completion_length": 3.0580358475446703, "epoch": 0.7189100396173543, "grad_norm": 8.763534475555268, "kl": 13.3734375, "learning_rate": 4.4190337668121964e-06, "loss": 0.5347, "reward": 0.0017347485314360255, "reward_std": 0.0025775103481024074, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -5.096584497437107e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0017857144586741925, "step": 465 }, { "completion_length": 3.4544644355773926, "epoch": 0.7266402550971108, "grad_norm": 59.69799948012789, "kl": 11.61796875, "learning_rate": 4.196761768328599e-06, "loss": 0.4648, "reward": 0.0034990686553499017, "reward_std": 0.004287067790926358, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -7.236019353626944e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.003571428917348385, "step": 470 }, { "completion_length": 4.27053590118885, "epoch": 0.7343704705768673, "grad_norm": 1031.7322972051127, "kl": 43.451171875, "learning_rate": 3.978731386684206e-06, "loss": 1.7423, "reward": 0.0020133760637264684, "reward_std": 0.00301552176138955, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.995749386646821e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0020833335351198913, "step": 475 }, { "completion_length": 4.559821632504463, "epoch": 0.7421006860566238, "grad_norm": 9.238057518751118, "kl": 11.9234375, "learning_rate": 3.7651019814126656e-06, "loss": 0.4769, "reward": 0.002013250943150524, "reward_std": 0.0030125229287023105, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -7.00825786225323e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0020833335351198913, "step": 480 }, { "completion_length": 4.471428775787354, "epoch": 0.7498309015363803, "grad_norm": 429.31107066607035, "kl": 12.3765625, "learning_rate": 3.5560296953512296e-06, "loss": 0.4952, "reward": 0.0032121944199275275, "reward_std": 0.004687485893919074, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.161540276252708e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.003273809840902686, "step": 485 }, { "completion_length": 3.438393014669418, "epoch": 0.7575611170161368, "grad_norm": 11.974760296352384, "kl": 12.7328125, "learning_rate": 3.3516673405151546e-06, "loss": 0.5094, "reward": 0.0014523516780712952, "reward_std": 0.0021428495080442643, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.574375780317496e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0014880953822284937, "step": 490 }, { "completion_length": 2.6714286893606185, "epoch": 0.7652913324958933, "grad_norm": 13.522851975241164, "kl": 13.8265625, "learning_rate": 3.1521642864065905e-06, "loss": 0.5533, "reward": 0.0008747298558859029, "reward_std": 0.00128467223768709, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.812734629567103e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0008928572293370962, "step": 495 }, { "completion_length": 2.2830358266830446, "epoch": 0.7730215479756498, "grad_norm": 22.821443928260532, "kl": 13.73671875, "learning_rate": 2.957666350839663e-06, "loss": 0.5495, "reward": 0.00028535680104226915, "reward_std": 0.0004355434665169966, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.2262285542874451e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 500 }, { "epoch": 0.7730215479756498, "eval_completion_length": 2.0568453520536423, "eval_kl": 305160.84375, "eval_loss": 12955.07421875, "eval_reward": -1.178500150444961e-05, "eval_reward_std": 1.4579100025002845e-05, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -1.1785001447606192e-05, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0, "eval_runtime": 4.1651, "eval_samples_per_second": 23.769, "eval_steps_per_second": 0.96, "step": 500 }, { "completion_length": 2.651785823702812, "epoch": 0.7807517634554063, "grad_norm": 17.159647997630483, "kl": 24.97421875, "learning_rate": 2.768315693361474e-06, "loss": 1.0014, "reward": 0.0002747434171084251, "reward_std": 0.00044760271368602387, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.287564226719496e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 505 }, { "completion_length": 2.7107144027948378, "epoch": 0.7884819789351628, "grad_norm": 10.757758728255537, "kl": 12.106640625, "learning_rate": 2.5842507113469307e-06, "loss": 0.4842, "reward": 0.0002685559660832837, "reward_std": 0.0004552351290545431, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.9063133104045848e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 510 }, { "completion_length": 2.5133929789066314, "epoch": 0.7962121944149193, "grad_norm": 5.559093286294482, "kl": 14.2421875, "learning_rate": 2.405605938843416e-06, "loss": 0.5699, "reward": 0.00027485358338381617, "reward_std": 0.00044865890789385076, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.2765490465559425e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 515 }, { "completion_length": 2.5794643998146056, "epoch": 0.8039424098946758, "grad_norm": 24.703227558264665, "kl": 14.301171875, "learning_rate": 2.2325119482391466e-06, "loss": 0.5722, "reward": 0.00026913669922237207, "reward_std": 0.0004578498039998991, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.8482373353710953e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 520 }, { "completion_length": 2.5741072446107864, "epoch": 0.8116726253744323, "grad_norm": 9.65984260064757, "kl": 18.32578125, "learning_rate": 2.065095254827133e-06, "loss": 0.7334, "reward": 0.0005574991783120709, "reward_std": 0.0008894859273956612, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.773899445462803e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 525 }, { "completion_length": 2.299107253551483, "epoch": 0.8194028408541888, "grad_norm": 10071.790977447952, "kl": 42.934375, "learning_rate": 1.9034782243345074e-06, "loss": 1.7224, "reward": 0.0005820921107670074, "reward_std": 0.0008561578569807437, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.314603203184106e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 530 }, { "completion_length": 2.182142949104309, "epoch": 0.8271330563339453, "grad_norm": 9.107437497703264, "kl": 24.06640625, "learning_rate": 1.7477789834847835e-06, "loss": 0.9624, "reward": -8.68767907036272e-06, "reward_std": 1.0127159256967389e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -8.687678895569207e-06, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 535 }, { "completion_length": 2.11428582072258, "epoch": 0.8348632718137018, "grad_norm": 3133.3728909697643, "kl": 24.715625, "learning_rate": 1.5981113336584041e-06, "loss": 0.9899, "reward": 0.00029145571905218046, "reward_std": 0.0004271924964957208, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.163340658815742e-06, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 540 }, { "completion_length": 2.3785715490579604, "epoch": 0.8425934872934583, "grad_norm": 6.451324617300761, "kl": 11.47578125, "learning_rate": 1.4545846677147446e-06, "loss": 0.4591, "reward": -1.44208539481383e-05, "reward_std": 1.8191472213224814e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.442085409308902e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 545 }, { "completion_length": 2.5437501281499864, "epoch": 0.8503237027732148, "grad_norm": 8.852243659606795, "kl": 10.2953125, "learning_rate": 1.3173038900362977e-06, "loss": 0.4115, "reward": -1.7397329310142594e-05, "reward_std": 1.9344854193192873e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -1.7397329256141347e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 550 }, { "completion_length": 2.79910726249218, "epoch": 0.8580539182529713, "grad_norm": 650.819343638309, "kl": 14.238671875, "learning_rate": 1.1863693398535115e-06, "loss": 0.5704, "reward": 0.00027451140200156486, "reward_std": 0.0004485697714363468, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.3107678526912422e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 555 }, { "completion_length": 2.8803572744131087, "epoch": 0.8657841337327278, "grad_norm": 27.080578131005346, "kl": 9.0375, "learning_rate": 1.0618767179063416e-06, "loss": 0.3616, "reward": 0.00026190450445682243, "reward_std": 0.0004659716969253935, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.5714515220774956e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 560 }, { "completion_length": 3.042857274413109, "epoch": 0.8735143492124843, "grad_norm": 5.636203662342512, "kl": 14.029296875, "learning_rate": 9.439170164960765e-07, "loss": 0.5618, "reward": 0.0005595829256748175, "reward_std": 0.0008843459886588078, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.565520306239023e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 565 }, { "completion_length": 3.1607144445180895, "epoch": 0.8812445646922408, "grad_norm": 12.360836835637958, "kl": 10.444140625, "learning_rate": 8.325764529785851e-07, "loss": 0.4176, "reward": 0.0005362081604275204, "reward_std": 0.00087594923707357, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -5.902998028091133e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 570 }, { "completion_length": 3.197321581840515, "epoch": 0.8889747801719973, "grad_norm": 13.758201446474159, "kl": 8.670703125, "learning_rate": 7.279364067476247e-07, "loss": 0.3469, "reward": 0.0011318138061554351, "reward_std": 0.0017494691826705378, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -5.8662485130867026e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.001190476305782795, "step": 575 }, { "completion_length": 3.0562501341104507, "epoch": 0.8967049956517538, "grad_norm": 22.686578554817494, "kl": 8.484375, "learning_rate": 6.300733597542086e-07, "loss": 0.3396, "reward": 0.00025256405359641576, "reward_std": 0.00048005678167974966, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.505503085994178e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 580 }, { "completion_length": 3.3812501549720766, "epoch": 0.9044352111315103, "grad_norm": 28.800922258193108, "kl": 14.012109375, "learning_rate": 5.390588406055497e-07, "loss": 0.5609, "reward": 0.00023189172827926542, "reward_std": 0.0005052683128155877, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.572738071355388e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 585 }, { "completion_length": 3.1178572922945023, "epoch": 0.9121654266112668, "grad_norm": 12.038541929412316, "kl": 12.13046875, "learning_rate": 4.549593722844492e-07, "loss": 0.4855, "reward": 0.001126268478135728, "reward_std": 0.0017627521178209093, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.420776590942978e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.001190476305782795, "step": 590 }, { "completion_length": 2.953571543097496, "epoch": 0.9198956420910233, "grad_norm": 1104.6062152570587, "kl": 30.114453125, "learning_rate": 3.77836423527278e-07, "loss": 1.2054, "reward": 0.0002571974996016024, "reward_std": 0.00046844321103201025, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.042155214278864e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.00029761907644569873, "step": 595 }, { "completion_length": 2.9026787012815474, "epoch": 0.9276258575707798, "grad_norm": 32.296049698541445, "kl": 103.203515625, "learning_rate": 3.0774636389618196e-07, "loss": 4.1484, "reward": 0.0008542773547659977, "reward_std": 0.0013046089939347638, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.857989551079299e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0008928572293370962, "step": 600 }, { "epoch": 0.9276258575707798, "eval_completion_length": 2.147916778922081, "eval_kl": 16777223.0, "eval_loss": 710646.5, "eval_reward": -1.3044384274962795e-05, "eval_reward_std": 1.6110884054398866e-05, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -1.3044384161275957e-05, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.0, "eval_runtime": 3.9792, "eval_samples_per_second": 24.879, "eval_steps_per_second": 1.005, "step": 600 }, { "completion_length": 2.700892987847328, "epoch": 0.9353560730505363, "grad_norm": 8.649682220388817, "kl": 9.39921875, "learning_rate": 2.44740422578269e-07, "loss": 0.3761, "reward": 0.0005689239432015824, "reward_std": 0.0008738741811551876, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.6314169258512265e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 605 }, { "completion_length": 2.722321552038193, "epoch": 0.9430862885302927, "grad_norm": 104.28970280124992, "kl": 8.729296875, "learning_rate": 1.8886465094192895e-07, "loss": 0.3491, "reward": -2.6268264444695433e-05, "reward_std": 3.34820041203443e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.6268265314399743e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 610 }, { "completion_length": 2.728571557998657, "epoch": 0.9508165040100492, "grad_norm": 5.9041098265539445, "kl": 7.64765625, "learning_rate": 1.401598888776523e-07, "loss": 0.306, "reward": 0.0011648979705810802, "reward_std": 0.0017095742682567306, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.5578282789240346e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.001190476305782795, "step": 615 }, { "completion_length": 2.960714411735535, "epoch": 0.9585467194898057, "grad_norm": 41.56000577480434, "kl": 8.06171875, "learning_rate": 9.866173494794462e-08, "loss": 0.3225, "reward": 0.0005575129636383735, "reward_std": 0.0008880274283423973, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.772521167064724e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 620 }, { "completion_length": 2.875000125169754, "epoch": 0.9662769349695622, "grad_norm": 39.07685937853193, "kl": 7.258203125, "learning_rate": 6.440052036815081e-08, "loss": 0.2903, "reward": 0.0008559451778481275, "reward_std": 0.0013053342402464807, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.691202451108211e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0008928572293370962, "step": 625 }, { "completion_length": 2.9696429878473283, "epoch": 0.9740071504493187, "grad_norm": 6.690428543206767, "kl": 27.094921875, "learning_rate": 3.7401286837214224e-08, "loss": 1.0869, "reward": 0.0005465759921776225, "reward_std": 0.0009026842066234053, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -4.866211212402049e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 630 }, { "completion_length": 2.7732144206762315, "epoch": 0.9817373659290752, "grad_norm": 295.51578758774014, "kl": 23.775, "learning_rate": 1.7683768234568745e-08, "loss": 0.9507, "reward": -2.310699615577505e-05, "reward_std": 2.9341498116153274e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -2.3106995850241675e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 635 }, { "completion_length": 2.838392987847328, "epoch": 0.9894675814088317, "grad_norm": 211.20880487987466, "kl": 8.240625, "learning_rate": 5.262376196544239e-09, "loss": 0.3298, "reward": 0.0014538712214815063, "reward_std": 0.0021464306717909666, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.4224153665718404e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0014880953822284937, "step": 640 }, { "completion_length": 2.853571555018425, "epoch": 0.9971977968885882, "grad_norm": 876.873875853119, "kl": 18.779296875, "learning_rate": 1.461895828280824e-10, "loss": 0.7514, "reward": 0.0005649905082677264, "reward_std": 0.0008803802551689444, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -3.0247643157110814e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0005952381528913975, "step": 645 }, { "completion_length": 3.2544644474983215, "epoch": 0.9987438399845395, "kl": 7.447265625, "reward": -6.586624478899239e-05, "reward_std": 8.917666181496031e-05, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -6.586624426319077e-05, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.0, "step": 646, "total_flos": 0.0, "train_loss": 27.60521244561175, "train_runtime": 16907.8148, "train_samples_per_second": 4.284, "train_steps_per_second": 0.038 } ], "logging_steps": 5, "max_steps": 646, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }