|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997795900374697, |
|
"eval_steps": 500, |
|
"global_step": 567, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 348.9375, |
|
"completions/max_terminated_length": 348.9375, |
|
"completions/mean_length": 166.421875, |
|
"completions/mean_terminated_length": 166.421875, |
|
"completions/min_length": 46.9375, |
|
"completions/min_terminated_length": 46.9375, |
|
"epoch": 0.001763279700242451, |
|
"grad_norm": 0.9710755373974377, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.1942, |
|
"num_tokens": 147932.0, |
|
"reward": 0.7169940862804651, |
|
"reward_std": 0.2023643054999411, |
|
"rewards/format_reward/mean": 0.6015625, |
|
"rewards/format_reward/std": 0.4860950894653797, |
|
"rewards/qatch_metrics/mean": 0.7311028698459268, |
|
"rewards/qatch_metrics/std": 0.21332682901993394, |
|
"rewards/tag_count_reward/mean": 0.7080078125, |
|
"rewards/tag_count_reward/std": 0.3562574004754424, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 320.859375, |
|
"completions/max_terminated_length": 320.859375, |
|
"completions/mean_length": 157.0234375, |
|
"completions/mean_terminated_length": 157.0234375, |
|
"completions/min_length": 45.984375, |
|
"completions/min_terminated_length": 45.984375, |
|
"epoch": 0.008816398501212255, |
|
"grad_norm": 1.3231567694690758, |
|
"kl": 0.000191517174243927, |
|
"learning_rate": 7.017543859649122e-08, |
|
"loss": -0.3181, |
|
"num_tokens": 670628.0, |
|
"reward": 0.7213839697651565, |
|
"reward_std": 0.15228567429585382, |
|
"rewards/format_reward/mean": 0.5888671875, |
|
"rewards/format_reward/std": 0.4847157197073102, |
|
"rewards/qatch_metrics/mean": 0.7386370480526239, |
|
"rewards/qatch_metrics/std": 0.13745591492079257, |
|
"rewards/tag_count_reward/mean": 0.693115234375, |
|
"rewards/tag_count_reward/std": 0.36212220159359276, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 336.6, |
|
"completions/max_terminated_length": 336.6, |
|
"completions/mean_length": 168.38125, |
|
"completions/mean_terminated_length": 168.38125, |
|
"completions/min_length": 47.6625, |
|
"completions/min_terminated_length": 47.6625, |
|
"epoch": 0.01763279700242451, |
|
"grad_norm": 1.2344979720798859, |
|
"kl": 0.0002501368522644043, |
|
"learning_rate": 1.5789473684210525e-07, |
|
"loss": -0.2443, |
|
"num_tokens": 1345916.0, |
|
"reward": 0.6422883274964988, |
|
"reward_std": 0.19879084336571395, |
|
"rewards/format_reward/mean": 0.596875, |
|
"rewards/format_reward/std": 0.4875950779765844, |
|
"rewards/qatch_metrics/mean": 0.6440179711673408, |
|
"rewards/qatch_metrics/std": 0.19703516885638236, |
|
"rewards/tag_count_reward/mean": 0.7037109375, |
|
"rewards/tag_count_reward/std": 0.3609486103057861, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 334.6375, |
|
"completions/max_terminated_length": 334.6375, |
|
"completions/mean_length": 172.48515625, |
|
"completions/mean_terminated_length": 172.48515625, |
|
"completions/min_length": 47.9625, |
|
"completions/min_terminated_length": 47.9625, |
|
"epoch": 0.026449195503636766, |
|
"grad_norm": 1.0324306738907167, |
|
"kl": 0.0003110170364379883, |
|
"learning_rate": 2.456140350877193e-07, |
|
"loss": -0.2119, |
|
"num_tokens": 2061897.0, |
|
"reward": 0.7045063060708344, |
|
"reward_std": 0.17503634537570179, |
|
"rewards/format_reward/mean": 0.6765625, |
|
"rewards/format_reward/std": 0.4584621708840132, |
|
"rewards/qatch_metrics/mean": 0.7045317724347114, |
|
"rewards/qatch_metrics/std": 0.17522094006126282, |
|
"rewards/tag_count_reward/mean": 0.7599609375, |
|
"rewards/tag_count_reward/std": 0.34046435691416266, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 359.0375, |
|
"completions/max_terminated_length": 359.0375, |
|
"completions/mean_length": 213.50859375, |
|
"completions/mean_terminated_length": 213.50859375, |
|
"completions/min_length": 63.2125, |
|
"completions/min_terminated_length": 63.2125, |
|
"epoch": 0.03526559400484902, |
|
"grad_norm": 0.7271916297615016, |
|
"kl": 0.0008575439453125, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": -0.1464, |
|
"num_tokens": 2779524.0, |
|
"reward": 0.736725780274719, |
|
"reward_std": 0.14974234900437294, |
|
"rewards/format_reward/mean": 0.82265625, |
|
"rewards/format_reward/std": 0.35349783338606355, |
|
"rewards/qatch_metrics/mean": 0.7184940161881969, |
|
"rewards/qatch_metrics/std": 0.14797394773922862, |
|
"rewards/tag_count_reward/mean": 0.8748046875, |
|
"rewards/tag_count_reward/std": 0.2501601692289114, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 345.9, |
|
"completions/max_terminated_length": 345.9, |
|
"completions/mean_length": 221.61953125, |
|
"completions/mean_terminated_length": 221.61953125, |
|
"completions/min_length": 110.9625, |
|
"completions/min_terminated_length": 110.9625, |
|
"epoch": 0.04408199250606128, |
|
"grad_norm": 0.36089994298099687, |
|
"kl": 0.002716684341430664, |
|
"learning_rate": 4.2105263157894733e-07, |
|
"loss": -0.0294, |
|
"num_tokens": 3532397.0, |
|
"reward": 0.7203821750357747, |
|
"reward_std": 0.15227624527178704, |
|
"rewards/format_reward/mean": 0.94375, |
|
"rewards/format_reward/std": 0.16674657054245473, |
|
"rewards/qatch_metrics/mean": 0.6798958362895065, |
|
"rewards/qatch_metrics/std": 0.1698521633632481, |
|
"rewards/tag_count_reward/mean": 0.9619140625, |
|
"rewards/tag_count_reward/std": 0.11083460114896297, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 351.1625, |
|
"completions/max_terminated_length": 351.1625, |
|
"completions/mean_length": 233.1609375, |
|
"completions/mean_terminated_length": 233.1609375, |
|
"completions/min_length": 146.275, |
|
"completions/min_terminated_length": 146.275, |
|
"epoch": 0.05289839100727353, |
|
"grad_norm": 0.39496099506167615, |
|
"kl": 0.002033853530883789, |
|
"learning_rate": 5.087719298245614e-07, |
|
"loss": 0.0164, |
|
"num_tokens": 4292411.0, |
|
"reward": 0.7153215611353516, |
|
"reward_std": 0.15457577785709872, |
|
"rewards/format_reward/mean": 0.98828125, |
|
"rewards/format_reward/std": 0.046875, |
|
"rewards/qatch_metrics/mean": 0.6668190175667406, |
|
"rewards/qatch_metrics/std": 0.1794413580093533, |
|
"rewards/tag_count_reward/mean": 0.9939453125, |
|
"rewards/tag_count_reward/std": 0.02421875, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 357.1375, |
|
"completions/max_terminated_length": 357.1375, |
|
"completions/mean_length": 236.98046875, |
|
"completions/mean_terminated_length": 236.98046875, |
|
"completions/min_length": 144.2375, |
|
"completions/min_terminated_length": 144.2375, |
|
"epoch": 0.06171478950848578, |
|
"grad_norm": 0.40335285901379214, |
|
"kl": 0.004096126556396485, |
|
"learning_rate": 5.964912280701754e-07, |
|
"loss": -0.0084, |
|
"num_tokens": 5094210.0, |
|
"reward": 0.7295356256887316, |
|
"reward_std": 0.15927550423293724, |
|
"rewards/format_reward/mean": 0.99296875, |
|
"rewards/format_reward/std": 0.028125, |
|
"rewards/qatch_metrics/mean": 0.6829440153203905, |
|
"rewards/qatch_metrics/std": 0.1853304866242979, |
|
"rewards/tag_count_reward/mean": 0.9947265625, |
|
"rewards/tag_count_reward/std": 0.02109375, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 348.875, |
|
"completions/max_terminated_length": 348.875, |
|
"completions/mean_length": 225.73203125, |
|
"completions/mean_terminated_length": 225.73203125, |
|
"completions/min_length": 141.75, |
|
"completions/min_terminated_length": 141.75, |
|
"epoch": 0.07053118800969804, |
|
"grad_norm": 0.28998715845893785, |
|
"kl": 0.001429271697998047, |
|
"learning_rate": 6.842105263157895e-07, |
|
"loss": 0.0087, |
|
"num_tokens": 5833499.0, |
|
"reward": 0.7490684226155281, |
|
"reward_std": 0.12702710015701085, |
|
"rewards/format_reward/mean": 0.9984375, |
|
"rewards/format_reward/std": 0.00625, |
|
"rewards/qatch_metrics/mean": 0.7050161464139819, |
|
"rewards/qatch_metrics/std": 0.14899895801208912, |
|
"rewards/tag_count_reward/mean": 0.99921875, |
|
"rewards/tag_count_reward/std": 0.003125, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 337.7625, |
|
"completions/max_terminated_length": 337.7625, |
|
"completions/mean_length": 234.1765625, |
|
"completions/mean_terminated_length": 234.1765625, |
|
"completions/min_length": 149.025, |
|
"completions/min_terminated_length": 149.025, |
|
"epoch": 0.0793475865109103, |
|
"grad_norm": 0.3515825769486025, |
|
"kl": 0.001779651641845703, |
|
"learning_rate": 7.719298245614034e-07, |
|
"loss": 0.0103, |
|
"num_tokens": 6625773.0, |
|
"reward": 0.7813593098893762, |
|
"reward_std": 0.1548473397095222, |
|
"rewards/format_reward/mean": 0.99765625, |
|
"rewards/format_reward/std": 0.009375, |
|
"rewards/qatch_metrics/mean": 0.7431203166022897, |
|
"rewards/qatch_metrics/std": 0.18166826255619525, |
|
"rewards/tag_count_reward/mean": 0.998828125, |
|
"rewards/tag_count_reward/std": 0.0046875, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 345.325, |
|
"completions/max_terminated_length": 345.325, |
|
"completions/mean_length": 230.6203125, |
|
"completions/mean_terminated_length": 230.6203125, |
|
"completions/min_length": 150.1625, |
|
"completions/min_terminated_length": 150.1625, |
|
"epoch": 0.08816398501212255, |
|
"grad_norm": 0.30438917372160834, |
|
"kl": 0.0018757820129394532, |
|
"learning_rate": 8.596491228070175e-07, |
|
"loss": 0.0006, |
|
"num_tokens": 7364007.0, |
|
"reward": 0.7656656216830016, |
|
"reward_std": 0.1071579195689992, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.724415887008945, |
|
"rewards/qatch_metrics/std": 0.12601496450661215, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 340.725, |
|
"completions/max_terminated_length": 340.725, |
|
"completions/mean_length": 219.0265625, |
|
"completions/mean_terminated_length": 219.0265625, |
|
"completions/min_length": 140.775, |
|
"completions/min_terminated_length": 140.775, |
|
"epoch": 0.09698038351333481, |
|
"grad_norm": 0.32051285299560317, |
|
"kl": 0.002438640594482422, |
|
"learning_rate": 9.473684210526315e-07, |
|
"loss": 0.0062, |
|
"num_tokens": 8093081.0, |
|
"reward": 0.7895074604079128, |
|
"reward_std": 0.10295408805832267, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7524651106446981, |
|
"rewards/qatch_metrics/std": 0.12109476723708212, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 333.6125, |
|
"completions/max_terminated_length": 333.6125, |
|
"completions/mean_length": 214.33203125, |
|
"completions/mean_terminated_length": 214.33203125, |
|
"completions/min_length": 138.125, |
|
"completions/min_terminated_length": 138.125, |
|
"epoch": 0.10579678201454706, |
|
"grad_norm": 0.3687506384245934, |
|
"kl": 0.0034315109252929686, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0167, |
|
"num_tokens": 8839074.0, |
|
"reward": 0.7963492956012488, |
|
"reward_std": 0.13961084922775627, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.76051432879176, |
|
"rewards/qatch_metrics/std": 0.16409257110208272, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 314.45, |
|
"completions/max_terminated_length": 314.45, |
|
"completions/mean_length": 205.66484375, |
|
"completions/mean_terminated_length": 205.66484375, |
|
"completions/min_length": 138.325, |
|
"completions/min_terminated_length": 138.325, |
|
"epoch": 0.11461318051575932, |
|
"grad_norm": 0.3598691865301115, |
|
"kl": 0.004790306091308594, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0072, |
|
"num_tokens": 9592197.0, |
|
"reward": 0.7755136819556355, |
|
"reward_std": 0.12071140363659652, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.735898441291647, |
|
"rewards/qatch_metrics/std": 0.14201342400228895, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 329.4875, |
|
"completions/max_terminated_length": 329.4875, |
|
"completions/mean_length": 218.82734375, |
|
"completions/mean_terminated_length": 218.82734375, |
|
"completions/min_length": 144.225, |
|
"completions/min_terminated_length": 144.225, |
|
"epoch": 0.12342957901697156, |
|
"grad_norm": 0.40718780074992844, |
|
"kl": 0.005454826354980469, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0052, |
|
"num_tokens": 10323832.0, |
|
"reward": 0.8082829523831606, |
|
"reward_std": 0.12197014213888906, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7744505235925316, |
|
"rewards/qatch_metrics/std": 0.1434942939085886, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 336.075, |
|
"completions/max_terminated_length": 336.075, |
|
"completions/mean_length": 220.25234375, |
|
"completions/mean_terminated_length": 220.25234375, |
|
"completions/min_length": 142.125, |
|
"completions/min_terminated_length": 142.125, |
|
"epoch": 0.13224597751818382, |
|
"grad_norm": 0.4066036566641524, |
|
"kl": 0.005206871032714844, |
|
"learning_rate": 1e-06, |
|
"loss": 0.003, |
|
"num_tokens": 11058347.0, |
|
"reward": 0.7646768478676677, |
|
"reward_std": 0.15216110937763005, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7231492260238156, |
|
"rewards/qatch_metrics/std": 0.17901307856664062, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 323.4625, |
|
"completions/max_terminated_length": 323.4625, |
|
"completions/mean_length": 212.896875, |
|
"completions/mean_terminated_length": 212.896875, |
|
"completions/min_length": 138.65, |
|
"completions/min_terminated_length": 138.65, |
|
"epoch": 0.14106237601939609, |
|
"grad_norm": 0.3757999153884869, |
|
"kl": 0.006637382507324219, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0123, |
|
"num_tokens": 11818007.0, |
|
"reward": 0.7600463172420859, |
|
"reward_std": 0.11857278576935641, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7177130273543298, |
|
"rewards/qatch_metrics/std": 0.1394514435902238, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 318.2375, |
|
"completions/max_terminated_length": 318.2375, |
|
"completions/mean_length": 212.54140625, |
|
"completions/mean_terminated_length": 212.54140625, |
|
"completions/min_length": 140.9125, |
|
"completions/min_terminated_length": 140.9125, |
|
"epoch": 0.14987877452060833, |
|
"grad_norm": 0.2843877461846444, |
|
"kl": 0.006991195678710938, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0103, |
|
"num_tokens": 12562892.0, |
|
"reward": 0.7912707846611738, |
|
"reward_std": 0.12222371264360846, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7544361999258399, |
|
"rewards/qatch_metrics/std": 0.14379260735586286, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 316.575, |
|
"completions/max_terminated_length": 316.575, |
|
"completions/mean_length": 211.7421875, |
|
"completions/mean_terminated_length": 211.7421875, |
|
"completions/min_length": 143.1125, |
|
"completions/min_terminated_length": 143.1125, |
|
"epoch": 0.1586951730218206, |
|
"grad_norm": 0.33442141566841666, |
|
"kl": 0.00792999267578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.001, |
|
"num_tokens": 13308402.0, |
|
"reward": 0.8190947765484452, |
|
"reward_std": 0.12180806350661441, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7871703177705058, |
|
"rewards/qatch_metrics/std": 0.14330361068132333, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 327.6, |
|
"completions/max_terminated_length": 327.6, |
|
"completions/mean_length": 220.39140625, |
|
"completions/mean_terminated_length": 220.39140625, |
|
"completions/min_length": 144.9375, |
|
"completions/min_terminated_length": 144.9375, |
|
"epoch": 0.16751157152303284, |
|
"grad_norm": 0.2060349260059472, |
|
"kl": 0.007589149475097656, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0053, |
|
"num_tokens": 14090727.0, |
|
"reward": 0.8084983274340629, |
|
"reward_std": 0.09074296336621046, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.774703911319375, |
|
"rewards/qatch_metrics/std": 0.1067564318422228, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 345.4375, |
|
"completions/max_terminated_length": 345.4375, |
|
"completions/mean_length": 233.321875, |
|
"completions/mean_terminated_length": 233.321875, |
|
"completions/min_length": 150.725, |
|
"completions/min_terminated_length": 150.725, |
|
"epoch": 0.1763279700242451, |
|
"grad_norm": 0.3437681507721623, |
|
"kl": 0.008571624755859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 14870899.0, |
|
"reward": 0.7850898955017328, |
|
"reward_std": 0.11421530576772057, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.747267971560359, |
|
"rewards/qatch_metrics/std": 0.13395735200028866, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 328.7, |
|
"completions/max_terminated_length": 328.7, |
|
"completions/mean_length": 218.9609375, |
|
"completions/mean_terminated_length": 218.9609375, |
|
"completions/min_length": 147.7125, |
|
"completions/min_terminated_length": 147.7125, |
|
"epoch": 0.18514436852545735, |
|
"grad_norm": 0.2956106829157968, |
|
"kl": 0.0087432861328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 15637185.0, |
|
"reward": 0.8008848559111357, |
|
"reward_std": 0.1065025228075683, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7657468795776368, |
|
"rewards/qatch_metrics/std": 0.12529709176160395, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 354.7, |
|
"completions/max_terminated_length": 354.7, |
|
"completions/mean_length": 241.70234375, |
|
"completions/mean_terminated_length": 241.70234375, |
|
"completions/min_length": 153.675, |
|
"completions/min_terminated_length": 153.675, |
|
"epoch": 0.19396076702666962, |
|
"grad_norm": 0.3156816970551577, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0043, |
|
"num_tokens": 16427236.0, |
|
"reward": 0.8307245042175054, |
|
"reward_std": 0.12043557950855757, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8008523487951607, |
|
"rewards/qatch_metrics/std": 0.14168892623565627, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 369.025, |
|
"completions/max_terminated_length": 369.025, |
|
"completions/mean_length": 242.18203125, |
|
"completions/mean_terminated_length": 242.18203125, |
|
"completions/min_length": 155.2875, |
|
"completions/min_terminated_length": 155.2875, |
|
"epoch": 0.20277716552788186, |
|
"grad_norm": 0.35044764518300353, |
|
"kl": 0.008629226684570312, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0172, |
|
"num_tokens": 17204669.0, |
|
"reward": 0.7873043244704604, |
|
"reward_std": 0.11700194676523097, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7498731818312081, |
|
"rewards/qatch_metrics/std": 0.13737923657754436, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 326.7, |
|
"completions/max_terminated_length": 326.7, |
|
"completions/mean_length": 223.1546875, |
|
"completions/mean_terminated_length": 223.1546875, |
|
"completions/min_length": 145.8125, |
|
"completions/min_terminated_length": 145.8125, |
|
"epoch": 0.21159356402909413, |
|
"grad_norm": 0.256901537727429, |
|
"kl": 0.008853530883789063, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 17978371.0, |
|
"reward": 0.8254757545888424, |
|
"reward_std": 0.09378883789759129, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.794677343731746, |
|
"rewards/qatch_metrics/std": 0.11033981533837504, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 320.0, |
|
"completions/max_terminated_length": 320.0, |
|
"completions/mean_length": 214.93828125, |
|
"completions/mean_terminated_length": 214.93828125, |
|
"completions/min_length": 144.85, |
|
"completions/min_terminated_length": 144.85, |
|
"epoch": 0.22040996253030637, |
|
"grad_norm": 0.3734181940444686, |
|
"kl": 0.009600830078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"num_tokens": 18723748.0, |
|
"reward": 0.7991562966257334, |
|
"reward_std": 0.09936780408024788, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7637132841162384, |
|
"rewards/qatch_metrics/std": 0.11690330407582224, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 328.3, |
|
"completions/max_terminated_length": 328.3, |
|
"completions/mean_length": 220.58515625, |
|
"completions/mean_terminated_length": 220.58515625, |
|
"completions/min_length": 137.9875, |
|
"completions/min_terminated_length": 137.9875, |
|
"epoch": 0.22922636103151864, |
|
"grad_norm": 0.38207935979690344, |
|
"kl": 0.010348129272460937, |
|
"learning_rate": 1e-06, |
|
"loss": 0.012, |
|
"num_tokens": 19500401.0, |
|
"reward": 0.812997136451304, |
|
"reward_std": 0.12226196355186403, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7799966168124228, |
|
"rewards/qatch_metrics/std": 0.14383760886266828, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 338.6125, |
|
"completions/max_terminated_length": 338.6125, |
|
"completions/mean_length": 220.6390625, |
|
"completions/mean_terminated_length": 220.6390625, |
|
"completions/min_length": 142.0375, |
|
"completions/min_terminated_length": 142.0375, |
|
"epoch": 0.23804275953273088, |
|
"grad_norm": 0.3417278304849786, |
|
"kl": 0.011602020263671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0024, |
|
"num_tokens": 20253139.0, |
|
"reward": 0.7694992732256651, |
|
"reward_std": 0.10531347445212305, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7288226603297516, |
|
"rewards/qatch_metrics/std": 0.12389820874668658, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 312.3, |
|
"completions/max_terminated_length": 312.3, |
|
"completions/mean_length": 208.0296875, |
|
"completions/mean_terminated_length": 208.0296875, |
|
"completions/min_length": 136.55, |
|
"completions/min_terminated_length": 136.55, |
|
"epoch": 0.24685915803394312, |
|
"grad_norm": 0.3425773253691935, |
|
"kl": 0.010589218139648438, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0116, |
|
"num_tokens": 20952281.0, |
|
"reward": 0.8347327882423997, |
|
"reward_std": 0.10723668891005218, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8055679749464616, |
|
"rewards/qatch_metrics/std": 0.1261608180589974, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 336.2375, |
|
"completions/max_terminated_length": 336.2375, |
|
"completions/mean_length": 226.9140625, |
|
"completions/mean_terminated_length": 226.9140625, |
|
"completions/min_length": 141.725, |
|
"completions/min_terminated_length": 141.725, |
|
"epoch": 0.2556755565351554, |
|
"grad_norm": 0.3141154785420489, |
|
"kl": 0.011150741577148437, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0045, |
|
"num_tokens": 21701563.0, |
|
"reward": 0.8115479167550802, |
|
"reward_std": 0.11712859515100718, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.778395052952692, |
|
"rewards/qatch_metrics/std": 0.13778491392731668, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 324.325, |
|
"completions/max_terminated_length": 324.325, |
|
"completions/mean_length": 214.4046875, |
|
"completions/mean_terminated_length": 214.4046875, |
|
"completions/min_length": 137.175, |
|
"completions/min_terminated_length": 137.175, |
|
"epoch": 0.26449195503636763, |
|
"grad_norm": 0.346933040349207, |
|
"kl": 0.020714950561523438, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0045, |
|
"num_tokens": 22407937.0, |
|
"reward": 0.8108294125646353, |
|
"reward_std": 0.08920670928346226, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7774463596753776, |
|
"rewards/qatch_metrics/std": 0.1049490759614855, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 346.025, |
|
"completions/max_terminated_length": 346.025, |
|
"completions/mean_length": 229.28125, |
|
"completions/mean_terminated_length": 229.28125, |
|
"completions/min_length": 140.1375, |
|
"completions/min_terminated_length": 140.1375, |
|
"epoch": 0.2733083535375799, |
|
"grad_norm": 0.34731522656971436, |
|
"kl": 0.011865234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0106, |
|
"num_tokens": 23184281.0, |
|
"reward": 0.8204727115109562, |
|
"reward_std": 0.12891971635399385, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.788791407039389, |
|
"rewards/qatch_metrics/std": 0.15167025988921523, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 318.55, |
|
"completions/max_terminated_length": 318.55, |
|
"completions/mean_length": 209.17734375, |
|
"completions/mean_terminated_length": 209.17734375, |
|
"completions/min_length": 137.3625, |
|
"completions/min_terminated_length": 137.3625, |
|
"epoch": 0.28212475203879217, |
|
"grad_norm": 0.3719362212745786, |
|
"kl": 0.011676025390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0151, |
|
"num_tokens": 23893036.0, |
|
"reward": 0.7991067057475447, |
|
"reward_std": 0.09515065372979734, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.763758339243941, |
|
"rewards/qatch_metrics/std": 0.1118375029604067, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 303.1375, |
|
"completions/max_terminated_length": 303.1375, |
|
"completions/mean_length": 201.36484375, |
|
"completions/mean_terminated_length": 201.36484375, |
|
"completions/min_length": 133.0625, |
|
"completions/min_terminated_length": 133.0625, |
|
"epoch": 0.2909411505400044, |
|
"grad_norm": 0.41140648881811986, |
|
"kl": 0.014212799072265626, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0031, |
|
"num_tokens": 24627135.0, |
|
"reward": 0.7533468393608928, |
|
"reward_std": 0.10053728537168354, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7098197954706847, |
|
"rewards/qatch_metrics/std": 0.11827916505280882, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 308.0, |
|
"completions/max_terminated_length": 308.0, |
|
"completions/mean_length": 203.5234375, |
|
"completions/mean_terminated_length": 203.5234375, |
|
"completions/min_length": 131.275, |
|
"completions/min_terminated_length": 131.275, |
|
"epoch": 0.29975754904121665, |
|
"grad_norm": 0.40595987740034806, |
|
"kl": 0.01388092041015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"num_tokens": 25368413.0, |
|
"reward": 0.7701841354370117, |
|
"reward_std": 0.09663771950872616, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7296283877098176, |
|
"rewards/qatch_metrics/std": 0.11369143746051122, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 328.725, |
|
"completions/max_terminated_length": 328.725, |
|
"completions/mean_length": 216.08125, |
|
"completions/mean_terminated_length": 216.08125, |
|
"completions/min_length": 138.325, |
|
"completions/min_terminated_length": 138.325, |
|
"epoch": 0.3085739475424289, |
|
"grad_norm": 0.25083299753484695, |
|
"kl": 0.01239776611328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"num_tokens": 26110613.0, |
|
"reward": 0.8106268728151917, |
|
"reward_std": 0.10756922718137503, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7772080772556365, |
|
"rewards/qatch_metrics/std": 0.12655203738249837, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 343.4625, |
|
"completions/max_terminated_length": 343.4625, |
|
"completions/mean_length": 231.24453125, |
|
"completions/mean_terminated_length": 231.24453125, |
|
"completions/min_length": 149.375, |
|
"completions/min_terminated_length": 149.375, |
|
"epoch": 0.3173903460436412, |
|
"grad_norm": 0.3299647961730041, |
|
"kl": 0.012017822265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0011, |
|
"num_tokens": 26876590.0, |
|
"reward": 0.7713820965960622, |
|
"reward_std": 0.11187393190339208, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7311411482747644, |
|
"rewards/qatch_metrics/std": 0.13165212250314653, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 353.5875, |
|
"completions/max_terminated_length": 353.5875, |
|
"completions/mean_length": 242.12734375, |
|
"completions/mean_terminated_length": 242.12734375, |
|
"completions/min_length": 155.3125, |
|
"completions/min_terminated_length": 155.3125, |
|
"epoch": 0.32620674454485343, |
|
"grad_norm": 0.32558550372348305, |
|
"kl": 0.013112258911132813, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0031, |
|
"num_tokens": 27658417.0, |
|
"reward": 0.8029140060767531, |
|
"reward_std": 0.10181500271428376, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7681341207586229, |
|
"rewards/qatch_metrics/std": 0.1197823622263968, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 358.3625, |
|
"completions/max_terminated_length": 358.3625, |
|
"completions/mean_length": 244.378125, |
|
"completions/mean_terminated_length": 244.378125, |
|
"completions/min_length": 150.175, |
|
"completions/min_terminated_length": 150.175, |
|
"epoch": 0.3350231430460657, |
|
"grad_norm": 0.32369260650095066, |
|
"kl": 0.012503814697265626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0035, |
|
"num_tokens": 28450773.0, |
|
"reward": 0.8509975776076317, |
|
"reward_std": 0.10326635130477371, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.8248408894985915, |
|
"rewards/qatch_metrics/std": 0.12098856130687637, |
|
"rewards/tag_count_reward/mean": 0.99921875, |
|
"rewards/tag_count_reward/std": 0.003125, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 346.8, |
|
"completions/max_terminated_length": 346.8, |
|
"completions/mean_length": 229.31484375, |
|
"completions/mean_terminated_length": 229.31484375, |
|
"completions/min_length": 142.2, |
|
"completions/min_terminated_length": 142.2, |
|
"epoch": 0.3438395415472779, |
|
"grad_norm": 0.33109392130180526, |
|
"kl": 0.012652969360351563, |
|
"learning_rate": 1e-06, |
|
"loss": 0.005, |
|
"num_tokens": 29193720.0, |
|
"reward": 0.8215816805139184, |
|
"reward_std": 0.10861785978777334, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7901994839310647, |
|
"rewards/qatch_metrics/std": 0.12737212259089575, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 332.9, |
|
"completions/max_terminated_length": 332.9, |
|
"completions/mean_length": 226.34296875, |
|
"completions/mean_terminated_length": 226.34296875, |
|
"completions/min_length": 144.8, |
|
"completions/min_terminated_length": 144.8, |
|
"epoch": 0.3526559400484902, |
|
"grad_norm": 0.34898255755360424, |
|
"kl": 0.0120391845703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.001, |
|
"num_tokens": 29936351.0, |
|
"reward": 0.8118854926899075, |
|
"reward_std": 0.08703446270665154, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.778688807785511, |
|
"rewards/qatch_metrics/std": 0.10239349115872756, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 341.4875, |
|
"completions/max_terminated_length": 341.4875, |
|
"completions/mean_length": 233.30625, |
|
"completions/mean_terminated_length": 233.30625, |
|
"completions/min_length": 146.125, |
|
"completions/min_terminated_length": 146.125, |
|
"epoch": 0.36147233854970245, |
|
"grad_norm": 0.2847757548262052, |
|
"kl": 0.013054656982421874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 30723207.0, |
|
"reward": 0.8451463960111141, |
|
"reward_std": 0.07684196562040597, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8178192753344774, |
|
"rewards/qatch_metrics/std": 0.09040231805993244, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 344.9125, |
|
"completions/max_terminated_length": 344.9125, |
|
"completions/mean_length": 233.31953125, |
|
"completions/mean_terminated_length": 233.31953125, |
|
"completions/min_length": 144.9875, |
|
"completions/min_terminated_length": 144.9875, |
|
"epoch": 0.3702887370509147, |
|
"grad_norm": 0.4034177578209816, |
|
"kl": 0.014620590209960937, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0075, |
|
"num_tokens": 31508960.0, |
|
"reward": 0.7541328657418489, |
|
"reward_std": 0.10346053875982761, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7107445362955331, |
|
"rewards/qatch_metrics/std": 0.12171828672289849, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 337.2125, |
|
"completions/max_terminated_length": 337.2125, |
|
"completions/mean_length": 229.95234375, |
|
"completions/mean_terminated_length": 229.95234375, |
|
"completions/min_length": 143.85, |
|
"completions/min_terminated_length": 143.85, |
|
"epoch": 0.37910513555212694, |
|
"grad_norm": 0.3513936634634679, |
|
"kl": 0.01492767333984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0053, |
|
"num_tokens": 32267923.0, |
|
"reward": 0.7756918715313077, |
|
"reward_std": 0.08237711414694786, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7361080780159682, |
|
"rewards/qatch_metrics/std": 0.09691425783094018, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 331.65, |
|
"completions/max_terminated_length": 331.65, |
|
"completions/mean_length": 225.4984375, |
|
"completions/mean_terminated_length": 225.4984375, |
|
"completions/min_length": 145.2, |
|
"completions/min_terminated_length": 145.2, |
|
"epoch": 0.38792153405333923, |
|
"grad_norm": 0.2816074681768635, |
|
"kl": 0.0148773193359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0012, |
|
"num_tokens": 33038177.0, |
|
"reward": 0.8032938608899712, |
|
"reward_std": 0.09598501106374897, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7685809925198555, |
|
"rewards/qatch_metrics/std": 0.11292354888282716, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 316.7, |
|
"completions/max_terminated_length": 316.7, |
|
"completions/mean_length": 215.88984375, |
|
"completions/mean_terminated_length": 215.88984375, |
|
"completions/min_length": 138.5125, |
|
"completions/min_terminated_length": 138.5125, |
|
"epoch": 0.3967379325545515, |
|
"grad_norm": 0.2792708523998877, |
|
"kl": 0.01438140869140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0031, |
|
"num_tokens": 33775780.0, |
|
"reward": 0.833816378749907, |
|
"reward_std": 0.08319322268362157, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8044898479929543, |
|
"rewards/qatch_metrics/std": 0.09787438737985213, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 321.1625, |
|
"completions/max_terminated_length": 321.1625, |
|
"completions/mean_length": 212.334375, |
|
"completions/mean_terminated_length": 212.334375, |
|
"completions/min_length": 140.6, |
|
"completions/min_terminated_length": 140.6, |
|
"epoch": 0.4055543310557637, |
|
"grad_norm": 0.2700162867921189, |
|
"kl": 0.015807723999023436, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0029, |
|
"num_tokens": 34518032.0, |
|
"reward": 0.7357370050624013, |
|
"reward_std": 0.10150592336431145, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.6891023456788389, |
|
"rewards/qatch_metrics/std": 0.11941873789764941, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 325.4625, |
|
"completions/max_terminated_length": 325.4625, |
|
"completions/mean_length": 218.98125, |
|
"completions/mean_terminated_length": 218.98125, |
|
"completions/min_length": 143.225, |
|
"completions/min_terminated_length": 143.225, |
|
"epoch": 0.41437072955697596, |
|
"grad_norm": 0.41442176380867557, |
|
"kl": 0.014601898193359376, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0086, |
|
"num_tokens": 35259240.0, |
|
"reward": 0.7820553651079536, |
|
"reward_std": 0.112267074175179, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.743594534881413, |
|
"rewards/qatch_metrics/std": 0.13207891816273332, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 319.25, |
|
"completions/max_terminated_length": 319.25, |
|
"completions/mean_length": 209.9296875, |
|
"completions/mean_terminated_length": 209.9296875, |
|
"completions/min_length": 137.3125, |
|
"completions/min_terminated_length": 137.3125, |
|
"epoch": 0.42318712805818826, |
|
"grad_norm": 0.3476187294524526, |
|
"kl": 0.014452362060546875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.002, |
|
"num_tokens": 35975326.0, |
|
"reward": 0.7921670457348228, |
|
"reward_std": 0.09589202178794949, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.755490631237626, |
|
"rewards/qatch_metrics/std": 0.1128141468463582, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 330.4, |
|
"completions/max_terminated_length": 330.4, |
|
"completions/mean_length": 219.6890625, |
|
"completions/mean_terminated_length": 219.6890625, |
|
"completions/min_length": 139.175, |
|
"completions/min_terminated_length": 139.175, |
|
"epoch": 0.4320035265594005, |
|
"grad_norm": 0.24104253998662034, |
|
"kl": 0.014841079711914062, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0048, |
|
"num_tokens": 36727824.0, |
|
"reward": 0.799939620308578, |
|
"reward_std": 0.10737682661347207, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7646578153668088, |
|
"rewards/qatch_metrics/std": 0.1263537659888243, |
|
"rewards/tag_count_reward/mean": 0.999609375, |
|
"rewards/tag_count_reward/std": 0.0015625, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 327.9375, |
|
"completions/max_terminated_length": 327.9375, |
|
"completions/mean_length": 218.1921875, |
|
"completions/mean_terminated_length": 218.1921875, |
|
"completions/min_length": 140.15, |
|
"completions/min_terminated_length": 140.15, |
|
"epoch": 0.44081992506061274, |
|
"grad_norm": 0.2539246494683978, |
|
"kl": 0.014622879028320313, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0025, |
|
"num_tokens": 37470134.0, |
|
"reward": 0.7929010545834899, |
|
"reward_std": 0.08832431975752116, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7563541710143908, |
|
"rewards/qatch_metrics/std": 0.10391096852254122, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 369.9625, |
|
"completions/max_terminated_length": 369.9625, |
|
"completions/mean_length": 228.9, |
|
"completions/mean_terminated_length": 228.9, |
|
"completions/min_length": 142.2125, |
|
"completions/min_terminated_length": 142.2125, |
|
"epoch": 0.449636323561825, |
|
"grad_norm": 0.389718556100374, |
|
"kl": 0.013026809692382813, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0146, |
|
"num_tokens": 38224342.0, |
|
"reward": 0.8210916120558978, |
|
"reward_std": 0.08054333752952517, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7895195348886773, |
|
"rewards/qatch_metrics/std": 0.0947568719740957, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 360.125, |
|
"completions/max_terminated_length": 360.125, |
|
"completions/mean_length": 236.08203125, |
|
"completions/mean_terminated_length": 236.08203125, |
|
"completions/min_length": 150.35, |
|
"completions/min_terminated_length": 150.35, |
|
"epoch": 0.4584527220630373, |
|
"grad_norm": 0.31386840054419024, |
|
"kl": 0.01431121826171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0003, |
|
"num_tokens": 39005375.0, |
|
"reward": 0.7916189678013325, |
|
"reward_std": 0.10870604729279876, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7548458357341588, |
|
"rewards/qatch_metrics/std": 0.12788947536610068, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 361.3, |
|
"completions/max_terminated_length": 361.3, |
|
"completions/mean_length": 241.08046875, |
|
"completions/mean_terminated_length": 241.08046875, |
|
"completions/min_length": 143.625, |
|
"completions/min_terminated_length": 143.625, |
|
"epoch": 0.4672691205642495, |
|
"grad_norm": 0.32199129625786044, |
|
"kl": 0.014544677734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0076, |
|
"num_tokens": 39806326.0, |
|
"reward": 0.8503130197525024, |
|
"reward_std": 0.08854990721156356, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8238976599182933, |
|
"rewards/qatch_metrics/std": 0.10417636758938897, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 357.9125, |
|
"completions/max_terminated_length": 357.9125, |
|
"completions/mean_length": 238.00703125, |
|
"completions/mean_terminated_length": 238.00703125, |
|
"completions/min_length": 150.7125, |
|
"completions/min_terminated_length": 150.7125, |
|
"epoch": 0.47608551906546176, |
|
"grad_norm": 0.24307758730384177, |
|
"kl": 0.0154083251953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0007, |
|
"num_tokens": 40579551.0, |
|
"reward": 0.7649708043783903, |
|
"reward_std": 0.09011161667294801, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7234950542595471, |
|
"rewards/qatch_metrics/std": 0.1060136711690575, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 329.375, |
|
"completions/max_terminated_length": 329.375, |
|
"completions/mean_length": 218.865625, |
|
"completions/mean_terminated_length": 218.865625, |
|
"completions/min_length": 136.9375, |
|
"completions/min_terminated_length": 136.9375, |
|
"epoch": 0.484901917566674, |
|
"grad_norm": 0.2437912279556105, |
|
"kl": 0.01680908203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0116, |
|
"num_tokens": 41304819.0, |
|
"reward": 0.7731978652998805, |
|
"reward_std": 0.09575790755916387, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7332773502916098, |
|
"rewards/qatch_metrics/std": 0.11264292849227786, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 297.425, |
|
"completions/max_terminated_length": 297.425, |
|
"completions/mean_length": 194.35234375, |
|
"completions/mean_terminated_length": 194.35234375, |
|
"completions/min_length": 129.725, |
|
"completions/min_terminated_length": 129.725, |
|
"epoch": 0.49371831606788624, |
|
"grad_norm": 0.3022726877012163, |
|
"kl": 0.017235565185546874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0071, |
|
"num_tokens": 41986038.0, |
|
"reward": 0.8300179397687316, |
|
"reward_std": 0.08664830076304497, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8000210979953408, |
|
"rewards/qatch_metrics/std": 0.10193918244767701, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 276.7875, |
|
"completions/max_terminated_length": 276.7875, |
|
"completions/mean_length": 183.3453125, |
|
"completions/mean_terminated_length": 183.3453125, |
|
"completions/min_length": 127.55, |
|
"completions/min_terminated_length": 127.55, |
|
"epoch": 0.5025347145690985, |
|
"grad_norm": 0.22342235477555344, |
|
"kl": 0.018723297119140624, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0009, |
|
"num_tokens": 42654592.0, |
|
"reward": 0.8566583547741174, |
|
"reward_std": 0.08069641448455514, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8313627634197474, |
|
"rewards/qatch_metrics/std": 0.09493696391291451, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 331.125, |
|
"completions/max_terminated_length": 331.125, |
|
"completions/mean_length": 220.7734375, |
|
"completions/mean_terminated_length": 220.7734375, |
|
"completions/min_length": 140.35, |
|
"completions/min_terminated_length": 140.35, |
|
"epoch": 0.5113511130703108, |
|
"grad_norm": 0.34607047517160744, |
|
"kl": 0.01548309326171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.006, |
|
"num_tokens": 43404558.0, |
|
"reward": 0.8139310251921416, |
|
"reward_std": 0.0980742353014648, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7810953183798119, |
|
"rewards/qatch_metrics/std": 0.11538145933300256, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 372.3625, |
|
"completions/max_terminated_length": 372.3625, |
|
"completions/mean_length": 253.30859375, |
|
"completions/mean_terminated_length": 253.30859375, |
|
"completions/min_length": 162.3375, |
|
"completions/min_terminated_length": 162.3375, |
|
"epoch": 0.5201675115715231, |
|
"grad_norm": 0.26233887817165846, |
|
"kl": 0.017038726806640626, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0014, |
|
"num_tokens": 44202393.0, |
|
"reward": 0.7521807491779328, |
|
"reward_std": 0.0808277386619011, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7084479197394102, |
|
"rewards/qatch_metrics/std": 0.09509145880147116, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 371.2875, |
|
"completions/max_terminated_length": 371.2875, |
|
"completions/mean_length": 256.74375, |
|
"completions/mean_terminated_length": 256.74375, |
|
"completions/min_length": 164.575, |
|
"completions/min_terminated_length": 164.575, |
|
"epoch": 0.5289839100727353, |
|
"grad_norm": 0.27528750385577455, |
|
"kl": 0.0166046142578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0034, |
|
"num_tokens": 44996449.0, |
|
"reward": 0.7653900509700179, |
|
"reward_std": 0.10405777737032622, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7239882865447725, |
|
"rewards/qatch_metrics/std": 0.12242091761436313, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 383.75, |
|
"completions/max_terminated_length": 383.75, |
|
"completions/mean_length": 262.81875, |
|
"completions/mean_terminated_length": 262.81875, |
|
"completions/min_length": 163.2, |
|
"completions/min_terminated_length": 163.2, |
|
"epoch": 0.5378003085739476, |
|
"grad_norm": 0.2916124436851988, |
|
"kl": 0.016686248779296874, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0091, |
|
"num_tokens": 45810553.0, |
|
"reward": 0.751503630913794, |
|
"reward_std": 0.10126840746961534, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7076513038016856, |
|
"rewards/qatch_metrics/std": 0.1191393076442182, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 385.4, |
|
"completions/max_terminated_length": 385.4, |
|
"completions/mean_length": 255.634375, |
|
"completions/mean_terminated_length": 255.634375, |
|
"completions/min_length": 157.3125, |
|
"completions/min_terminated_length": 157.3125, |
|
"epoch": 0.5466167070751597, |
|
"grad_norm": 0.3015279724579697, |
|
"kl": 0.017456817626953124, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0074, |
|
"num_tokens": 46626773.0, |
|
"reward": 0.7629254685714841, |
|
"reward_std": 0.14258774023037404, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.721100265783025, |
|
"rewards/qatch_metrics/std": 0.16777858033310622, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 331.5375, |
|
"completions/max_terminated_length": 331.5375, |
|
"completions/mean_length": 217.5625, |
|
"completions/mean_terminated_length": 217.5625, |
|
"completions/min_length": 134.5375, |
|
"completions/min_terminated_length": 134.5375, |
|
"epoch": 0.555433105576372, |
|
"grad_norm": 0.2736624916091261, |
|
"kl": 0.016427993774414062, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 47337797.0, |
|
"reward": 0.8417512066662312, |
|
"reward_std": 0.06132402071962133, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8138479188084602, |
|
"rewards/qatch_metrics/std": 0.07210502420784906, |
|
"rewards/tag_count_reward/mean": 0.999609375, |
|
"rewards/tag_count_reward/std": 0.0015625, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 347.9625, |
|
"completions/max_terminated_length": 347.9625, |
|
"completions/mean_length": 225.61484375, |
|
"completions/mean_terminated_length": 225.61484375, |
|
"completions/min_length": 139.5625, |
|
"completions/min_terminated_length": 139.5625, |
|
"epoch": 0.5642495040775843, |
|
"grad_norm": 0.3231991864642648, |
|
"kl": 0.017105865478515624, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 48108472.0, |
|
"reward": 0.8285636451095343, |
|
"reward_std": 0.09632078433714923, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7983101591467857, |
|
"rewards/qatch_metrics/std": 0.11331857727491297, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 347.45, |
|
"completions/max_terminated_length": 347.45, |
|
"completions/mean_length": 231.76328125, |
|
"completions/mean_terminated_length": 231.76328125, |
|
"completions/min_length": 148.275, |
|
"completions/min_terminated_length": 148.275, |
|
"epoch": 0.5730659025787965, |
|
"grad_norm": 0.2973174138867235, |
|
"kl": 0.016844940185546876, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0095, |
|
"num_tokens": 48899481.0, |
|
"reward": 0.7851231107488275, |
|
"reward_std": 0.09552590373905331, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7472036501319508, |
|
"rewards/qatch_metrics/std": 0.11238342153937993, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 359.275, |
|
"completions/max_terminated_length": 359.275, |
|
"completions/mean_length": 234.6609375, |
|
"completions/mean_terminated_length": 234.6609375, |
|
"completions/min_length": 146.075, |
|
"completions/min_terminated_length": 146.075, |
|
"epoch": 0.5818823010800088, |
|
"grad_norm": 0.3288208170160159, |
|
"kl": 0.017102813720703124, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0158, |
|
"num_tokens": 49645255.0, |
|
"reward": 0.8200173564255238, |
|
"reward_std": 0.09870836285626865, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7882671934901737, |
|
"rewards/qatch_metrics/std": 0.11608153889974346, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 357.4375, |
|
"completions/max_terminated_length": 357.4375, |
|
"completions/mean_length": 230.21484375, |
|
"completions/mean_terminated_length": 230.21484375, |
|
"completions/min_length": 140.6625, |
|
"completions/min_terminated_length": 140.6625, |
|
"epoch": 0.5906986995812211, |
|
"grad_norm": 0.4614331608564361, |
|
"kl": 0.018951416015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0084, |
|
"num_tokens": 50437466.0, |
|
"reward": 0.7783582922071218, |
|
"reward_std": 0.09422233710065483, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7393484384519979, |
|
"rewards/qatch_metrics/std": 0.1104362107347697, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 324.8875, |
|
"completions/max_terminated_length": 324.8875, |
|
"completions/mean_length": 209.18125, |
|
"completions/mean_terminated_length": 209.18125, |
|
"completions/min_length": 134.8375, |
|
"completions/min_terminated_length": 134.8375, |
|
"epoch": 0.5995150980824333, |
|
"grad_norm": 0.2725327541303037, |
|
"kl": 0.019439697265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0057, |
|
"num_tokens": 51167586.0, |
|
"reward": 0.8479114411398768, |
|
"reward_std": 0.07958720491733402, |
|
"rewards/format_reward/mean": 0.9984375, |
|
"rewards/format_reward/std": 0.00625, |
|
"rewards/qatch_metrics/mean": 0.8213135461322963, |
|
"rewards/qatch_metrics/std": 0.0926669392734766, |
|
"rewards/tag_count_reward/mean": 0.9990234375, |
|
"rewards/tag_count_reward/std": 0.00390625, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 345.0833333333333, |
|
"completions/max_terminated_length": 345.0833333333333, |
|
"completions/mean_length": 229.90755208333334, |
|
"completions/mean_terminated_length": 229.90755208333334, |
|
"completions/min_length": 146.02083333333334, |
|
"completions/min_terminated_length": 146.02083333333334, |
|
"epoch": 0.6083314965836456, |
|
"grad_norm": 0.33113384827355125, |
|
"kl": 0.019663492838541668, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0054, |
|
"num_tokens": 51884877.0, |
|
"reward": 0.7889465639988581, |
|
"reward_std": 0.09934755021822639, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7517018277818958, |
|
"rewards/qatch_metrics/std": 0.11687947452689211, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 336.4125, |
|
"completions/max_terminated_length": 336.4125, |
|
"completions/mean_length": 225.73203125, |
|
"completions/mean_terminated_length": 225.73203125, |
|
"completions/min_length": 142.075, |
|
"completions/min_terminated_length": 142.075, |
|
"epoch": 0.6171478950848578, |
|
"grad_norm": 0.3566644104382955, |
|
"kl": 0.020180511474609374, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0094, |
|
"num_tokens": 52662630.0, |
|
"reward": 0.8238930691033601, |
|
"reward_std": 0.08169953491305933, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7928153664804996, |
|
"rewards/qatch_metrics/std": 0.09611710296012461, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 343.2, |
|
"completions/max_terminated_length": 343.2, |
|
"completions/mean_length": 236.98046875, |
|
"completions/mean_terminated_length": 236.98046875, |
|
"completions/min_length": 150.9, |
|
"completions/min_terminated_length": 150.9, |
|
"epoch": 0.6259642935860701, |
|
"grad_norm": 0.28516393114431005, |
|
"kl": 0.017165374755859376, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 53419117.0, |
|
"reward": 0.8083362869918347, |
|
"reward_std": 0.0969126635696739, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7746166706085205, |
|
"rewards/qatch_metrics/std": 0.11360130067914724, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 352.6875, |
|
"completions/max_terminated_length": 352.6875, |
|
"completions/mean_length": 239.6953125, |
|
"completions/mean_terminated_length": 239.6953125, |
|
"completions/min_length": 146.55, |
|
"completions/min_terminated_length": 146.55, |
|
"epoch": 0.6347806920872824, |
|
"grad_norm": 0.17994755171571641, |
|
"kl": 0.017398834228515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"num_tokens": 54185031.0, |
|
"reward": 0.8027490990236401, |
|
"reward_std": 0.05896135854927707, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7679401046669228, |
|
"rewards/qatch_metrics/std": 0.06936630747477465, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 352.7125, |
|
"completions/max_terminated_length": 352.7125, |
|
"completions/mean_length": 239.52109375, |
|
"completions/mean_terminated_length": 239.52109375, |
|
"completions/min_length": 145.0125, |
|
"completions/min_terminated_length": 145.0125, |
|
"epoch": 0.6435970905884946, |
|
"grad_norm": 0.23679998463114285, |
|
"kl": 0.017606353759765624, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0013, |
|
"num_tokens": 54936114.0, |
|
"reward": 0.8317079545930028, |
|
"reward_std": 0.08216131356894038, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8020208382047713, |
|
"rewards/qatch_metrics/std": 0.0966144205071032, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 368.05, |
|
"completions/max_terminated_length": 368.05, |
|
"completions/mean_length": 253.25546875, |
|
"completions/mean_terminated_length": 253.25546875, |
|
"completions/min_length": 161.875, |
|
"completions/min_terminated_length": 161.875, |
|
"epoch": 0.6524134890897069, |
|
"grad_norm": 0.2556963595685909, |
|
"kl": 0.018042755126953126, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0036, |
|
"num_tokens": 55728841.0, |
|
"reward": 0.8250438664108515, |
|
"reward_std": 0.08220216338286264, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7941807355555284, |
|
"rewards/qatch_metrics/std": 0.09668861866011866, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 341.6625, |
|
"completions/max_terminated_length": 341.6625, |
|
"completions/mean_length": 237.73046875, |
|
"completions/mean_terminated_length": 237.73046875, |
|
"completions/min_length": 150.525, |
|
"completions/min_terminated_length": 150.525, |
|
"epoch": 0.6612298875909192, |
|
"grad_norm": 0.2751968612090182, |
|
"kl": 0.018801116943359376, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0012, |
|
"num_tokens": 56499232.0, |
|
"reward": 0.8543739832937718, |
|
"reward_std": 0.09150902703240718, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.828675264492631, |
|
"rewards/qatch_metrics/std": 0.10765768758892591, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 319.2875, |
|
"completions/max_terminated_length": 319.2875, |
|
"completions/mean_length": 219.77734375, |
|
"completions/mean_terminated_length": 219.77734375, |
|
"completions/min_length": 137.475, |
|
"completions/min_terminated_length": 137.475, |
|
"epoch": 0.6700462860921313, |
|
"grad_norm": 0.31154862475726136, |
|
"kl": 0.01962738037109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 57237059.0, |
|
"reward": 0.8459062715992332, |
|
"reward_std": 0.08771184119395911, |
|
"rewards/format_reward/mean": 0.9984375, |
|
"rewards/format_reward/std": 0.00625, |
|
"rewards/qatch_metrics/mean": 0.8189200535358395, |
|
"rewards/qatch_metrics/std": 0.1028526050504297, |
|
"rewards/tag_count_reward/mean": 0.999609375, |
|
"rewards/tag_count_reward/std": 0.0015625, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 302.25, |
|
"completions/max_terminated_length": 302.25, |
|
"completions/mean_length": 203.378125, |
|
"completions/mean_terminated_length": 203.378125, |
|
"completions/min_length": 137.75, |
|
"completions/min_terminated_length": 137.75, |
|
"epoch": 0.6788626845933436, |
|
"grad_norm": 0.2352132785923824, |
|
"kl": 0.02192230224609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0049, |
|
"num_tokens": 57972807.0, |
|
"reward": 0.82975386492908, |
|
"reward_std": 0.06049516258062795, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7997104193782434, |
|
"rewards/qatch_metrics/std": 0.07117078317096456, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 304.5125, |
|
"completions/max_terminated_length": 304.5125, |
|
"completions/mean_length": 204.80234375, |
|
"completions/mean_terminated_length": 204.80234375, |
|
"completions/min_length": 136.125, |
|
"completions/min_terminated_length": 136.125, |
|
"epoch": 0.6876790830945558, |
|
"grad_norm": 0.27452609767604863, |
|
"kl": 0.022785186767578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0009, |
|
"num_tokens": 58735722.0, |
|
"reward": 0.821035155840218, |
|
"reward_std": 0.09460214762948453, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7895565141923726, |
|
"rewards/qatch_metrics/std": 0.11137145487591624, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 298.75, |
|
"completions/max_terminated_length": 298.75, |
|
"completions/mean_length": 199.359375, |
|
"completions/mean_terminated_length": 199.359375, |
|
"completions/min_length": 133.325, |
|
"completions/min_terminated_length": 133.325, |
|
"epoch": 0.6964954815957681, |
|
"grad_norm": 0.36379488564541623, |
|
"kl": 0.023514556884765624, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0139, |
|
"num_tokens": 59453574.0, |
|
"reward": 0.7852070070803165, |
|
"reward_std": 0.11039148237323389, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7473023503087461, |
|
"rewards/qatch_metrics/std": 0.12987233807798476, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 297.525, |
|
"completions/max_terminated_length": 297.525, |
|
"completions/mean_length": 192.63515625, |
|
"completions/mean_terminated_length": 192.63515625, |
|
"completions/min_length": 130.4375, |
|
"completions/min_terminated_length": 130.4375, |
|
"epoch": 0.7053118800969804, |
|
"grad_norm": 0.31515335460233057, |
|
"kl": 0.02534027099609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0013, |
|
"num_tokens": 60164595.0, |
|
"reward": 0.8131733348593115, |
|
"reward_std": 0.09474163451232016, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7802039116621018, |
|
"rewards/qatch_metrics/std": 0.11146075297147036, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 301.3375, |
|
"completions/max_terminated_length": 301.3375, |
|
"completions/mean_length": 194.86171875, |
|
"completions/mean_terminated_length": 194.86171875, |
|
"completions/min_length": 132.875, |
|
"completions/min_terminated_length": 132.875, |
|
"epoch": 0.7141282785981926, |
|
"grad_norm": 0.36020622913136857, |
|
"kl": 0.02452239990234375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0044, |
|
"num_tokens": 60876834.0, |
|
"reward": 0.7978186447173357, |
|
"reward_std": 0.12316365442238748, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7622429746668786, |
|
"rewards/qatch_metrics/std": 0.14448482398875057, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 330.2875, |
|
"completions/max_terminated_length": 330.2875, |
|
"completions/mean_length": 222.946875, |
|
"completions/mean_terminated_length": 222.946875, |
|
"completions/min_length": 147.725, |
|
"completions/min_terminated_length": 147.725, |
|
"epoch": 0.7229446770994049, |
|
"grad_norm": 0.23920846142153848, |
|
"kl": 0.022620391845703126, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0015, |
|
"num_tokens": 61628878.0, |
|
"reward": 0.8119752595201135, |
|
"reward_std": 0.09445713473833166, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7788403697311879, |
|
"rewards/qatch_metrics/std": 0.11107292836531997, |
|
"rewards/tag_count_reward/mean": 0.99921875, |
|
"rewards/tag_count_reward/std": 0.003125, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 362.675, |
|
"completions/max_terminated_length": 362.675, |
|
"completions/mean_length": 238.16640625, |
|
"completions/mean_terminated_length": 238.16640625, |
|
"completions/min_length": 149.55, |
|
"completions/min_terminated_length": 149.55, |
|
"epoch": 0.7317610756006171, |
|
"grad_norm": 0.29869110706075575, |
|
"kl": 0.02274169921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0076, |
|
"num_tokens": 62407347.0, |
|
"reward": 0.8343930047005415, |
|
"reward_std": 0.1020151567645371, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.805168230831623, |
|
"rewards/qatch_metrics/std": 0.12001783675514162, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 351.1625, |
|
"completions/max_terminated_length": 351.1625, |
|
"completions/mean_length": 238.10625, |
|
"completions/mean_terminated_length": 238.10625, |
|
"completions/min_length": 151.7, |
|
"completions/min_terminated_length": 151.7, |
|
"epoch": 0.7405774741018294, |
|
"grad_norm": 0.2830612988897095, |
|
"kl": 0.021949005126953126, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 63172059.0, |
|
"reward": 0.82713833283633, |
|
"reward_std": 0.07888575517572463, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7967367230914533, |
|
"rewards/qatch_metrics/std": 0.0928413406247273, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 350.6625, |
|
"completions/max_terminated_length": 350.6625, |
|
"completions/mean_length": 234.71875, |
|
"completions/mean_terminated_length": 234.71875, |
|
"completions/min_length": 152.85, |
|
"completions/min_terminated_length": 152.85, |
|
"epoch": 0.7493938726030417, |
|
"grad_norm": 0.303175281797131, |
|
"kl": 0.023668670654296876, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0014, |
|
"num_tokens": 63958371.0, |
|
"reward": 0.8379335630685091, |
|
"reward_std": 0.08907212568446994, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8093335981713607, |
|
"rewards/qatch_metrics/std": 0.10479074087925255, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 356.15, |
|
"completions/max_terminated_length": 356.15, |
|
"completions/mean_length": 233.284375, |
|
"completions/mean_terminated_length": 233.284375, |
|
"completions/min_length": 144.625, |
|
"completions/min_terminated_length": 144.625, |
|
"epoch": 0.7582102711042539, |
|
"grad_norm": 0.3362377001779979, |
|
"kl": 0.020977020263671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0094, |
|
"num_tokens": 64755503.0, |
|
"reward": 0.7857262346893549, |
|
"reward_std": 0.0779630596237439, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.748039587616222, |
|
"rewards/qatch_metrics/std": 0.09121573810989503, |
|
"rewards/tag_count_reward/mean": 0.9994140625, |
|
"rewards/tag_count_reward/std": 0.00234375, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 342.0875, |
|
"completions/max_terminated_length": 342.0875, |
|
"completions/mean_length": 224.60390625, |
|
"completions/mean_terminated_length": 224.60390625, |
|
"completions/min_length": 139.6875, |
|
"completions/min_terminated_length": 139.6875, |
|
"epoch": 0.7670266696054662, |
|
"grad_norm": 0.27147477635209977, |
|
"kl": 0.023795318603515626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.007, |
|
"num_tokens": 65500036.0, |
|
"reward": 0.8154280468821525, |
|
"reward_std": 0.0885659102234058, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7828565156087279, |
|
"rewards/qatch_metrics/std": 0.10419519691495224, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 343.15, |
|
"completions/max_terminated_length": 343.15, |
|
"completions/mean_length": 225.971875, |
|
"completions/mean_terminated_length": 225.971875, |
|
"completions/min_length": 134.8875, |
|
"completions/min_terminated_length": 134.8875, |
|
"epoch": 0.7758430681066785, |
|
"grad_norm": 0.23493992268435515, |
|
"kl": 0.0210540771484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.001, |
|
"num_tokens": 66252064.0, |
|
"reward": 0.8585389815270901, |
|
"reward_std": 0.08183195294986945, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8335752619896084, |
|
"rewards/qatch_metrics/std": 0.09627289194759214, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 355.575, |
|
"completions/max_terminated_length": 355.575, |
|
"completions/mean_length": 242.10859375, |
|
"completions/mean_terminated_length": 242.10859375, |
|
"completions/min_length": 151.175, |
|
"completions/min_terminated_length": 151.175, |
|
"epoch": 0.7846594666078907, |
|
"grad_norm": 0.2140376472128798, |
|
"kl": 0.02173614501953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 67023275.0, |
|
"reward": 0.87310497071594, |
|
"reward_std": 0.0631409589201212, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8507117207162083, |
|
"rewards/qatch_metrics/std": 0.07428348822286353, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 353.0, |
|
"completions/max_terminated_length": 353.0, |
|
"completions/mean_length": 226.27578125, |
|
"completions/mean_terminated_length": 226.27578125, |
|
"completions/min_length": 139.95, |
|
"completions/min_terminated_length": 139.95, |
|
"epoch": 0.793475865109103, |
|
"grad_norm": 0.3122440505453943, |
|
"kl": 0.021511077880859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 67777324.0, |
|
"reward": 0.8520105175673962, |
|
"reward_std": 0.08210055120289325, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.8260210990672932, |
|
"rewards/qatch_metrics/std": 0.09612429473781958, |
|
"rewards/tag_count_reward/mean": 0.9994140625, |
|
"rewards/tag_count_reward/std": 0.00234375, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 327.425, |
|
"completions/max_terminated_length": 327.425, |
|
"completions/mean_length": 212.48125, |
|
"completions/mean_terminated_length": 212.48125, |
|
"completions/min_length": 135.5125, |
|
"completions/min_terminated_length": 135.5125, |
|
"epoch": 0.8022922636103151, |
|
"grad_norm": 0.3121983142519688, |
|
"kl": 0.022968292236328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0106, |
|
"num_tokens": 68512644.0, |
|
"reward": 0.8407730983570219, |
|
"reward_std": 0.06822409054730087, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8126742199849104, |
|
"rewards/qatch_metrics/std": 0.08026364156394265, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 340.25, |
|
"completions/max_terminated_length": 340.25, |
|
"completions/mean_length": 223.44375, |
|
"completions/mean_terminated_length": 223.44375, |
|
"completions/min_length": 141.9625, |
|
"completions/min_terminated_length": 141.9625, |
|
"epoch": 0.8111086621115274, |
|
"grad_norm": 0.3458299809901387, |
|
"kl": 0.02670440673828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0033, |
|
"num_tokens": 69262636.0, |
|
"reward": 0.8292963147163391, |
|
"reward_std": 0.08730166773311794, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.7992755254730582, |
|
"rewards/qatch_metrics/std": 0.10261548004345969, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 340.2375, |
|
"completions/max_terminated_length": 340.2375, |
|
"completions/mean_length": 223.1640625, |
|
"completions/mean_terminated_length": 223.1640625, |
|
"completions/min_length": 144.9375, |
|
"completions/min_terminated_length": 144.9375, |
|
"epoch": 0.8199250606127397, |
|
"grad_norm": 0.2539455340605116, |
|
"kl": 0.02646636962890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0074, |
|
"num_tokens": 70028046.0, |
|
"reward": 0.782775210775435, |
|
"reward_std": 0.09664202006024425, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7444414100635186, |
|
"rewards/qatch_metrics/std": 0.11369650048291077, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 343.65, |
|
"completions/max_terminated_length": 343.65, |
|
"completions/mean_length": 220.91953125, |
|
"completions/mean_terminated_length": 220.91953125, |
|
"completions/min_length": 142.9125, |
|
"completions/min_terminated_length": 142.9125, |
|
"epoch": 0.8287414591139519, |
|
"grad_norm": 0.29340586589965106, |
|
"kl": 0.023850250244140624, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0028, |
|
"num_tokens": 70817127.0, |
|
"reward": 0.83585394769907, |
|
"reward_std": 0.08243265537312254, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8068869862705469, |
|
"rewards/qatch_metrics/std": 0.09697959922486916, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 319.25, |
|
"completions/max_terminated_length": 319.25, |
|
"completions/mean_length": 212.4921875, |
|
"completions/mean_terminated_length": 212.4921875, |
|
"completions/min_length": 137.675, |
|
"completions/min_terminated_length": 137.675, |
|
"epoch": 0.8375578576151642, |
|
"grad_norm": 0.30646618504129836, |
|
"kl": 0.02505950927734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0023, |
|
"num_tokens": 71553357.0, |
|
"reward": 0.8542880930006505, |
|
"reward_std": 0.07209075510618276, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8285742185791605, |
|
"rewards/qatch_metrics/std": 0.08481265990703832, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 342.7375, |
|
"completions/max_terminated_length": 342.7375, |
|
"completions/mean_length": 233.45546875, |
|
"completions/mean_terminated_length": 233.45546875, |
|
"completions/min_length": 147.2125, |
|
"completions/min_terminated_length": 147.2125, |
|
"epoch": 0.8463742561163765, |
|
"grad_norm": 0.3162519376309847, |
|
"kl": 0.0266021728515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0054, |
|
"num_tokens": 72361556.0, |
|
"reward": 0.8315023425966501, |
|
"reward_std": 0.10010532954183873, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8017674525035545, |
|
"rewards/qatch_metrics/std": 0.11777098168386146, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 341.25, |
|
"completions/max_terminated_length": 341.25, |
|
"completions/mean_length": 234.0140625, |
|
"completions/mean_terminated_length": 234.0140625, |
|
"completions/min_length": 154.7, |
|
"completions/min_terminated_length": 154.7, |
|
"epoch": 0.8551906546175887, |
|
"grad_norm": 0.297834941840183, |
|
"kl": 0.02375946044921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0113, |
|
"num_tokens": 73116902.0, |
|
"reward": 0.7931279448792339, |
|
"reward_std": 0.08709526733728126, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7566210999619216, |
|
"rewards/qatch_metrics/std": 0.10246502548689022, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 324.5625, |
|
"completions/max_terminated_length": 324.5625, |
|
"completions/mean_length": 220.03203125, |
|
"completions/mean_terminated_length": 220.03203125, |
|
"completions/min_length": 138.8125, |
|
"completions/min_terminated_length": 138.8125, |
|
"epoch": 0.864007053118801, |
|
"grad_norm": 0.20060310802569065, |
|
"kl": 0.022344970703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0071, |
|
"num_tokens": 73868767.0, |
|
"reward": 0.8470947509631515, |
|
"reward_std": 0.06845701420679688, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8201114618219435, |
|
"rewards/qatch_metrics/std": 0.08053766970988363, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 315.8, |
|
"completions/max_terminated_length": 315.8, |
|
"completions/mean_length": 211.5171875, |
|
"completions/mean_terminated_length": 211.5171875, |
|
"completions/min_length": 139.7625, |
|
"completions/min_terminated_length": 139.7625, |
|
"epoch": 0.8728234516200132, |
|
"grad_norm": 0.2914817614366052, |
|
"kl": 0.023822784423828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0034, |
|
"num_tokens": 74608789.0, |
|
"reward": 0.8249077584594489, |
|
"reward_std": 0.08015898242010736, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7940091195050627, |
|
"rewards/qatch_metrics/std": 0.09430469113285653, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 330.7, |
|
"completions/max_terminated_length": 330.7, |
|
"completions/mean_length": 225.6296875, |
|
"completions/mean_terminated_length": 225.6296875, |
|
"completions/min_length": 147.9125, |
|
"completions/min_terminated_length": 147.9125, |
|
"epoch": 0.8816398501212255, |
|
"grad_norm": 0.27724680534940715, |
|
"kl": 0.024609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0044, |
|
"num_tokens": 75370139.0, |
|
"reward": 0.7901407666504383, |
|
"reward_std": 0.08103454456286271, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7531067751348018, |
|
"rewards/qatch_metrics/std": 0.09533476178003183, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 333.875, |
|
"completions/max_terminated_length": 333.875, |
|
"completions/mean_length": 220.46640625, |
|
"completions/mean_terminated_length": 220.46640625, |
|
"completions/min_length": 142.2, |
|
"completions/min_terminated_length": 142.2, |
|
"epoch": 0.8904562486224378, |
|
"grad_norm": 0.23661752197680214, |
|
"kl": 0.022271728515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0066, |
|
"num_tokens": 76121648.0, |
|
"reward": 0.8500555850565433, |
|
"reward_std": 0.07330139055848121, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8235947942361236, |
|
"rewards/qatch_metrics/std": 0.08623693531844764, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 317.7875, |
|
"completions/max_terminated_length": 317.7875, |
|
"completions/mean_length": 210.9671875, |
|
"completions/mean_terminated_length": 210.9671875, |
|
"completions/min_length": 138.975, |
|
"completions/min_terminated_length": 138.975, |
|
"epoch": 0.89927264712365, |
|
"grad_norm": 0.21851493451825393, |
|
"kl": 0.025273895263671874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0036, |
|
"num_tokens": 76872790.0, |
|
"reward": 0.8583780597895384, |
|
"reward_std": 0.08067860676383135, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8333859436213971, |
|
"rewards/qatch_metrics/std": 0.09491601307672681, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 321.775, |
|
"completions/max_terminated_length": 321.775, |
|
"completions/mean_length": 216.0375, |
|
"completions/mean_terminated_length": 216.0375, |
|
"completions/min_length": 141.1625, |
|
"completions/min_terminated_length": 141.1625, |
|
"epoch": 0.9080890456248623, |
|
"grad_norm": 0.2829422652691582, |
|
"kl": 0.025566864013671874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0082, |
|
"num_tokens": 77616342.0, |
|
"reward": 0.8006692606955766, |
|
"reward_std": 0.08726950597483665, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7654932357370854, |
|
"rewards/qatch_metrics/std": 0.10267001276370138, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 290.275, |
|
"completions/max_terminated_length": 290.275, |
|
"completions/mean_length": 195.296875, |
|
"completions/mean_terminated_length": 195.296875, |
|
"completions/min_length": 130.9125, |
|
"completions/min_terminated_length": 130.9125, |
|
"epoch": 0.9169054441260746, |
|
"grad_norm": 0.2228119611592202, |
|
"kl": 0.023336029052734374, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0065, |
|
"num_tokens": 78313506.0, |
|
"reward": 0.7732636205852031, |
|
"reward_std": 0.05938901338377036, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7332513069733977, |
|
"rewards/qatch_metrics/std": 0.06986943007213994, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 305.6125, |
|
"completions/max_terminated_length": 305.6125, |
|
"completions/mean_length": 205.165625, |
|
"completions/mean_terminated_length": 205.165625, |
|
"completions/min_length": 137.2, |
|
"completions/min_terminated_length": 137.2, |
|
"epoch": 0.9257218426272867, |
|
"grad_norm": 0.36003445762352576, |
|
"kl": 0.023264312744140626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0079, |
|
"num_tokens": 79054486.0, |
|
"reward": 0.8107579160481692, |
|
"reward_std": 0.07336599697882776, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7773622425273061, |
|
"rewards/qatch_metrics/std": 0.08631294009974226, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 323.2125, |
|
"completions/max_terminated_length": 323.2125, |
|
"completions/mean_length": 220.25, |
|
"completions/mean_terminated_length": 220.25, |
|
"completions/min_length": 145.8625, |
|
"completions/min_terminated_length": 145.8625, |
|
"epoch": 0.934538241128499, |
|
"grad_norm": 0.30049379873568555, |
|
"kl": 0.0247039794921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0046, |
|
"num_tokens": 79821798.0, |
|
"reward": 0.8396705185994506, |
|
"reward_std": 0.07967656154651195, |
|
"rewards/format_reward/mean": 0.99921875, |
|
"rewards/format_reward/std": 0.003125, |
|
"rewards/qatch_metrics/mean": 0.811480475217104, |
|
"rewards/qatch_metrics/std": 0.09332353379577399, |
|
"rewards/tag_count_reward/mean": 0.9998046875, |
|
"rewards/tag_count_reward/std": 0.00078125, |
|
"step": 530 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 334.6, |
|
"completions/max_terminated_length": 334.6, |
|
"completions/mean_length": 224.428125, |
|
"completions/mean_terminated_length": 224.428125, |
|
"completions/min_length": 143.85, |
|
"completions/min_terminated_length": 143.85, |
|
"epoch": 0.9433546396297112, |
|
"grad_norm": 0.26616721247221914, |
|
"kl": 0.024881744384765626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0114, |
|
"num_tokens": 80588906.0, |
|
"reward": 0.8039220564067364, |
|
"reward_std": 0.08752607379574329, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7693200555630029, |
|
"rewards/qatch_metrics/std": 0.10297185693052598, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 337.025, |
|
"completions/max_terminated_length": 337.025, |
|
"completions/mean_length": 222.8765625, |
|
"completions/mean_terminated_length": 222.8765625, |
|
"completions/min_length": 146.35, |
|
"completions/min_terminated_length": 146.35, |
|
"epoch": 0.9521710381309235, |
|
"grad_norm": 0.26056623342737706, |
|
"kl": 0.023919677734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0073, |
|
"num_tokens": 81348428.0, |
|
"reward": 0.8161520700901747, |
|
"reward_std": 0.06797666533384472, |
|
"rewards/format_reward/mean": 0.9984375, |
|
"rewards/format_reward/std": 0.00625, |
|
"rewards/qatch_metrics/mean": 0.7839151054620743, |
|
"rewards/qatch_metrics/std": 0.07955915104830638, |
|
"rewards/tag_count_reward/mean": 0.999609375, |
|
"rewards/tag_count_reward/std": 0.0015625, |
|
"step": 540 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 329.45, |
|
"completions/max_terminated_length": 329.45, |
|
"completions/mean_length": 222.23515625, |
|
"completions/mean_terminated_length": 222.23515625, |
|
"completions/min_length": 145.9375, |
|
"completions/min_terminated_length": 145.9375, |
|
"epoch": 0.9609874366321358, |
|
"grad_norm": 0.25072778848110355, |
|
"kl": 0.02654876708984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0051, |
|
"num_tokens": 82112537.0, |
|
"reward": 0.8805699178948998, |
|
"reward_std": 0.07740147057920695, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8594940162263811, |
|
"rewards/qatch_metrics/std": 0.09106055875308812, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 326.8125, |
|
"completions/max_terminated_length": 326.8125, |
|
"completions/mean_length": 216.265625, |
|
"completions/mean_terminated_length": 216.265625, |
|
"completions/min_length": 142.5, |
|
"completions/min_terminated_length": 142.5, |
|
"epoch": 0.969803835133348, |
|
"grad_norm": 0.25948373422406573, |
|
"kl": 0.024462890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0006, |
|
"num_tokens": 82872173.0, |
|
"reward": 0.8117451569065451, |
|
"reward_std": 0.0860868067946285, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7785237034782767, |
|
"rewards/qatch_metrics/std": 0.10127860223874449, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 337.1875, |
|
"completions/max_terminated_length": 337.1875, |
|
"completions/mean_length": 216.21640625, |
|
"completions/mean_terminated_length": 216.21640625, |
|
"completions/min_length": 137.8125, |
|
"completions/min_terminated_length": 137.8125, |
|
"epoch": 0.9786202336345603, |
|
"grad_norm": 0.3645941597990175, |
|
"kl": 0.02496337890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0009, |
|
"num_tokens": 83634770.0, |
|
"reward": 0.8101593714207411, |
|
"reward_std": 0.08605160953738959, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7766580768162384, |
|
"rewards/qatch_metrics/std": 0.10123719241237268, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 343.25, |
|
"completions/max_terminated_length": 343.25, |
|
"completions/mean_length": 231.0015625, |
|
"completions/mean_terminated_length": 231.0015625, |
|
"completions/min_length": 151.325, |
|
"completions/min_terminated_length": 151.325, |
|
"epoch": 0.9874366321357725, |
|
"grad_norm": 0.21758575977885092, |
|
"kl": 0.023052978515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0066, |
|
"num_tokens": 84398692.0, |
|
"reward": 0.8315089832991361, |
|
"reward_std": 0.07600237202714197, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8017752626910806, |
|
"rewards/qatch_metrics/std": 0.08941456201137044, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 355.1375, |
|
"completions/max_terminated_length": 355.1375, |
|
"completions/mean_length": 232.215625, |
|
"completions/mean_terminated_length": 232.215625, |
|
"completions/min_length": 148.6875, |
|
"completions/min_terminated_length": 148.6875, |
|
"epoch": 0.9962530306369848, |
|
"grad_norm": 0.2378336659692683, |
|
"kl": 0.023166656494140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0083, |
|
"num_tokens": 85155592.0, |
|
"reward": 0.7450484920293092, |
|
"reward_std": 0.07769823344424368, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.7000570335425437, |
|
"rewards/qatch_metrics/std": 0.09140969021318597, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 290.0625, |
|
"completions/max_terminated_length": 290.0625, |
|
"completions/mean_length": 187.46484375, |
|
"completions/mean_terminated_length": 187.46484375, |
|
"completions/min_length": 124.65625, |
|
"completions/min_terminated_length": 124.65625, |
|
"epoch": 0.9997795900374697, |
|
"kl": 0.027322769165039062, |
|
"num_tokens": 85427878.0, |
|
"reward": 0.8777023833245039, |
|
"reward_std": 0.08281319939123932, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"rewards/qatch_metrics/mean": 0.8561204457655549, |
|
"rewards/qatch_metrics/std": 0.09742730065772776, |
|
"rewards/tag_count_reward/mean": 1.0, |
|
"rewards/tag_count_reward/std": 0.0, |
|
"step": 567, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0013445673257480432, |
|
"train_runtime": 158207.651, |
|
"train_samples_per_second": 0.057, |
|
"train_steps_per_second": 0.004 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 567, |
|
"num_input_tokens_seen": 85427878, |
|
"num_train_epochs": 1, |
|
"save_steps": 57, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|